/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Mark Friedrichs, Mike Houston                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

//Applies a permutation to the given random vector stream
//Totally bandwidth bound and streams are large, should be done infrequently.
//Probably don't want gvin and gvout to be the same stream!

kernel void kpermute_vectors( float gstrwidth, float perm<>, 
		float3 gvin[][], out float3 gvout<> ){
	float2 ind;
	//ind.y = floor( perm / gstrwidth );
   ind.y = round( (perm - fmod( perm, gstrwidth ))/gstrwidth );
	ind.x = perm - ind.y * gstrwidth;

	gvout = gvin[ ind ];
}

/* Stochastic Integrator
 *
 * Since we don't have random numbers on the GPU, we 
 * precalculate random numbers from a normal distribution
 * and store them in several large textures. Periodically
 * these will have to be refreshed on the CPU.
 *
 * Doing gathers instead of streams for the random vectors, 
 * but the gathers should be fully coherent anyway. 
 * This is just to avoid some
 * painful manipulations with multiple streams and domains.  
 * */


/* First part before first constraint call 
 *
 *
 * Many things can be precalculated here
 * If we become compute bound (wonder how), we can precalc stuff
 *
 * Assuming constant temperature. If you want to change the 
 * temperature, you have to change all the precalculated 
 * constants that use it.
 * */

/*
 * Alternative formulation to handle precision better
 * In updatesd1, we output only deltaV's. The shake
 * kernel uses the deltaV's and the X's to output 
 * a bunch of constrained deltax's
 * */


kernel void kupdate_sd1_fix1(
		float xstrwidth,   //atom stream width
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers

		//sd constants, prefixed with c 
		//to avoid 1 letter names, cxx = sdc[0].xx
		float cem,

		//sd precalculated constants
		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
		
		//sdpc = sd precalculated per atom values
		//sdpc.x = 1/sqrt(m)*sig[0].Yv
		//sdpc.y = 1/sqrt(m)*sig[0].V
		float2 sdpc<>,   
		
		//Normally distributed random vectors
		float3 fgauss[][],
	
		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
		float3 posq<>,
		float3 f<>,
		float3 v<>,

		float invmass<>,
		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
		out float3 vnew<>,
		out float3 posqp<>
		){
   float  c1;
	float3 Vmh;
	float3 fg1, fg2;
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index

	igauss         = indexof( posq ).xy;
	linind_gauss   = igauss.y * xstrwidth + igauss.x;

	//2 random vectors used in each fragment

	linind_gauss   = 2.0f * linind_gauss + goffset; 	
   igauss.y       = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
	igauss.x       = linind_gauss - igauss.y * gstrwidth;
	fg1            = fgauss[ igauss ];

	linind_gauss  += 1.0f;
   igauss.y       = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
	igauss.x       = linind_gauss - igauss.y * gstrwidth;
	fg2            = fgauss[ igauss ];

	//Vmh            = (sd2X*pc3) + (sdpc.x*fg1);

	Vmh.x          = pc3*sd2X.x + sdpc.x*fg1.x;
	Vmh.y          = pc3*sd2X.y + sdpc.x*fg1.y;
	Vmh.z          = pc3*sd2X.z + sdpc.x*fg1.z;

	//sd1V           = sdpc.y*fg2;
   sd1V.x         = sdpc.y*fg2.x;
   sd1V.y         = sdpc.y*fg2.y;
   sd1V.z         = sdpc.y*fg2.z;
  
	//vnew           = (v*cem) + (invmass*f*pc1) + sd1V - (Vmh*cem);
   c1             = invmass*pc1;
   vnew.x         = cem*(v.x - Vmh.x) + c1*f.x + sd1V.x;
   vnew.y         = cem*(v.y - Vmh.y) + c1*f.y + sd1V.y;
   vnew.z         = cem*(v.z - Vmh.z) + c1*f.z + sd1V.z;

	posqp.x        = pc2*vnew.x;
	posqp.y        = pc2*vnew.y;
	posqp.z        = pc2*vnew.z;

}

kernel void kupdate_sd2_fix1(
		float xstrwidth,   //stream width of atoms
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers
		
		float pc1,          //this is 1/pc2 for sd1 
		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
		

		//per atom precalcs
		//sdpc.x = 1/sqrt(m)*sig[0].Yx
		//sdpc.y = 1/sqrt(m)*sig[0].X
		float2 sdpc<>,

		//Normally distributed random vectors
		float3 fgauss[][],
		
		float3 sd1V<>,      //calculated in sd1
		float3 posq<>,      //positions pre-update
		float3 posqp<>,     //deltas post constraint
		float3 vnew<>,      //velocities after sd1
		out float3 sd2X<>,  //Used in sd1
		out float3 v<>,     //velocities corrected for constraints
		out float3 posqp2<> //new deltas, need to be constrained again
		){

	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index
	float3 fg1, fg2;
	float3 Xmh;

	igauss            = indexof( posq ).xy;
	linind_gauss      = igauss.y * xstrwidth + igauss.x;
	linind_gauss      = 2.0f * linind_gauss + goffset; 	
	
   igauss.y          = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
	igauss.x          = linind_gauss - igauss.y * gstrwidth;
	fg1               = fgauss[ igauss ];

	linind_gauss     += 1.0f;
   igauss.y          = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
	igauss.x          = linind_gauss - igauss.y * gstrwidth;
	fg2               = fgauss[ igauss ];
			
	v.x               = pc1*posqp.x;
	v.y               = pc1*posqp.y;
	v.z               = pc1*posqp.z;

	//Xmh               = sd1V * pc2 + sdpc.x * fg1;
	Xmh.x             = pc2*sd1V.x + sdpc.x*fg1.x;
	Xmh.y             = pc2*sd1V.y + sdpc.x*fg1.y;
	Xmh.z             = pc2*sd1V.z + sdpc.x*fg1.z;

	//sd2X              = sdpc.y * fg2;
	sd2X.x            = sdpc.y*fg2.x;
	sd2X.y            = sdpc.y*fg2.y;
	sd2X.z            = sdpc.y*fg2.z;

	posqp2            = posqp + sd2X - Xmh;
}