/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/


/* Stochastic Integrator
 *
 * Since we don't have random numbers on the GPU, we 
 * precalculate random numbers from a normal distribution
 * and store them in several large textures. Periodically
 * these will have to be refreshed on the CPU.
 *
 * Doing gathers instead of streams for the random vectors, 
 * but the gathers should be fully coherent anyway. 
 * This is just to avoid some
 * painful manipulations with multiple streams and domains.  
 * */


/* First part before first constraint call 
 *
 *
 * Many things can be precalculated here
 * If we become compute bound (wonder how), we can precalc stuff
 *
 * Assuming constant temperature. If you want to change the 
 * temperature, you have to change all the precalculated 
 * constants that use it.
 * */

/*
kernel void kupdate_sd1(
		float xstrwidth,   //atom stream width
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers

		//sd constants, prefixed with c 
		//to avoid 1 letter names, cxx = sdc[0].xx
		float cem,

		//sd precalculated constants
		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
		
		//sdpc = sd precalculated per atom values
		//sdpc.x = 1/sqrt(m)*sig[0].Yv
		//sdpc.y = 1/sqrt(m)*sig[0].V
		float2 sdpc<>,   
		
		//Normally distributed random vectors
		float3 fgauss[][],
	
		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
		float4 posq<>,
		float3 f<>,
		float3 v<>,

		float invmass<>,
		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
		out float3 vnew<>,
		out float4 posqp<>
		){
	float3 Vmh;
	float3 fg1, fg2;
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index

	igauss = indexof( posq );
	linind_gauss = igauss.y * xstrwidth + igauss.x;
	//2 random vectors used in each fragment
	linind_gauss = 2 * linind_gauss + goffset; 	
	
	igauss.y = floor( linind_gauss / gstrwidth );
	igauss.x = linind_gauss - igauss.y * gstrwidth;

	fg1 = fgauss[ igauss ];

	linind_gauss += 1.0f;
	igauss.y = floor( linind_gauss / gstrwidth );
	igauss.x = linind_gauss - igauss.y * gstrwidth;

	fg2 = fgauss[ igauss ];
	
	Vmh  = sd2X * pc3 + sdpc.x * fg1;
	sd1V = sdpc.y * fg2;
	
	vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
	
	posqp.w = posq.w;
	posqp.xyz = posq.xyz + vnew * pc2;
	
} */

/*Second part of sd update, after first constraints call*/
/*
kernel void kupdate_sd2(
		float xstrwidth,   //stream width of atoms
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers
		
		float pc1,          //this is 1/pc2 for sd1 
		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
		

		//per atom precalcs
		//sdpc.x = 1/sqrt(m)*sig[0].Yx
		//sdpc.y = 1/sqrt(m)*sig[0].X
		float2 sdpc<>,

		//Normally distributed random vectors
		float3 fgauss[][],
		
		float3 sd1V<>,      //calculated in sd1
		float4 posq<>,      //positions pre-update
		float4 posqp<>,     //positions post-constraint
		float3 vnew<>,      //velocities after sd1
		out float3 sd2X<>,  //Used in sd1
		out float3 v<>,     //velocities corrected for constraints
		out float4 posqp2<> //Will need to be constrained again
		){
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index
	float3 fg1, fg2;
	float3 Xmh;

	igauss = indexof( posq );
	linind_gauss = igauss.y * xstrwidth + igauss.x;
	linind_gauss = 2 * linind_gauss + goffset; 	
	
	igauss.y = floor( linind_gauss / gstrwidth );
	igauss.x = linind_gauss - igauss.y * gstrwidth;
	fg1 = fgauss[ igauss ];

	linind_gauss += 1.0f;
	igauss.y = floor( linind_gauss / gstrwidth );
	igauss.x = linind_gauss - igauss.y * gstrwidth;
	fg2 = fgauss[ igauss ];
			
	v = pc1 * ( posqp.xyz - posq.xyz );
	Xmh = sd1V * pc2 + sdpc.x * fg1;
	
	sd2X = sdpc.y * fg2;

	posqp2 = posqp;
	posqp2.xyz += sd2X - Xmh;
} */

//Applies a permutation to the given random vector stream
//Totally bandwidth bound and streams are large, should be done infrequently.
//Probably don't want gvin and gvout to be the same stream!
kernel void kpermute_vectors( float gstrwidth, float perm<>, 
		float3 gvin[][], out float3 gvout<> )
{
	float2 ind;
	//ind.y = floor( perm / gstrwidth );
   ind.y = round( (perm - fmod( perm, gstrwidth ))/gstrwidth );
	ind.x = perm - ind.y * gstrwidth;

	gvout = gvin[ ind ];
}

/*
 * Alternative formulation to handle precision better
 * In updatesd1, we output only deltaV's. The shake
 * kernel uses the deltaV's and the X's to output 
 * a bunch of constrained deltax's
 * */


kernel void kupdate_sd1_fix1(
		float xstrwidth,   //atom stream width
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers

		//sd constants, prefixed with c 
		//to avoid 1 letter names, cxx = sdc[0].xx
		float cem,

		//sd precalculated constants
		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
		
		//sdpc = sd precalculated per atom values
		//sdpc.x = 1/sqrt(m)*sig[0].Yv
		//sdpc.y = 1/sqrt(m)*sig[0].V
		float2 sdpc<>,   
		
		//Normally distributed random vectors
		float3 fgauss[][],
	
		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
		float4 posq<>,
		float3 f<>,
		float3 v<>,

		float invmass<>,
		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
		out float3 vnew<>,
		out float4 posqp<>
		){
	float3 Vmh;
	float3 fg1, fg2;
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index

	igauss = indexof( posq );
	linind_gauss = igauss.y * xstrwidth + igauss.x;
	//2 random vectors used in each fragment
	linind_gauss = 2 * linind_gauss + goffset; 	
	
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
	igauss.x = linind_gauss - igauss.y * gstrwidth;

	fg1 = fgauss[ igauss ];

	linind_gauss += 1.0f;
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
	igauss.x = linind_gauss - igauss.y * gstrwidth;

	fg2 = fgauss[ igauss ];
	
   /* 
      Gromacs code
    n               = 1 -> ngtc

    sdc[n].gdt      = dt/tau_t[n];
    sdc[n].eph      = exp(sdc[n].gdt/2);
    sdc[n].emh      = exp(-sdc[n].gdt/2);
    sdc[n].em       = exp(-sdc[n].gdt);

    sdc[n].b        = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);

    // 2.15 in paper (C)
    sdc[n].c        = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;

    sdc[n].d        = 2 - sdc[n].eph - sdc[n].emh;

      Vmh           = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);

      V[n-start][d] = ism*sig[gt].V*fgauss(&jran);

      v[n][d]       = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
                      V[n-start][d] - sdc[gt].em*Vmh;
           
      xprime[n][d]  = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh); 
  
   */

// ---------------------------------------------------------------------------------------------
	// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
   //sdpc.x = 1/sqrt(m)*sig[0].Yv
	Vmh  = sd2X * pc3 + sdpc.x * fg1;
// ---------------------------------------------------------------------------------------------

// ---------------------------------------------------------------------------------------------
	//sdpc.y = 1/sqrt(m)*sig[0].V
	sd1V = sdpc.y * fg2;
// ---------------------------------------------------------------------------------------------
	

// ---------------------------------------------------------------------------------------------
   // 
   // float pc1 = tau_t[0] * ( 1 - sdc[0].em )
	vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
	
   // float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
	posqp       = posq;
	posqp.xyz  += vnew * pc2;
}

kernel void kupdate_sd2_fix1(
		float xstrwidth,   //stream width of atoms
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers
		
		float pc1,          //this is 1/pc2 for sd1 
		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
		

		//per atom precalcs
		//sdpc.x = 1/sqrt(m)*sig[0].Yx
		//sdpc.y = 1/sqrt(m)*sig[0].X
		float2 sdpc<>,

		//Normally distributed random vectors
		float3 fgauss[][],
		
		float3 sd1V<>,      //calculated in sd1
		float4 posq<>,      //positions pre-update
		float4 posqp<>,     //deltas post constraint
		float3 vnew<>,      //velocities after sd1
		out float3 sd2X<>,  //Used in sd1
		out float3 v<>,     //velocities corrected for constraints
		out float4 posqp2<> //new deltas, need to be constrained again
		){
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index
	float3 fg1, fg2;
	float3 Xmh;

	igauss = indexof( posq );
	linind_gauss = igauss.y * xstrwidth + igauss.x;
	linind_gauss = 2 * linind_gauss + goffset; 	
	
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
	igauss.x = linind_gauss - igauss.y * gstrwidth;
	fg1 = fgauss[ igauss ];

	linind_gauss += 1.0f;
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
	igauss.x = linind_gauss - igauss.y * gstrwidth;
	fg2 = fgauss[ igauss ];
			
	//v   = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
	v   = pc1 * (posqp.xyz - posq.xyz);
	Xmh = sd1V * pc2 + sdpc.x * fg1;
	
	sd2X = sdpc.y * fg2;

	posqp2 = posqp;
	posqp2.xyz += sd2X - Xmh;
}


kernel void kupdate_sd1_fix1_FixedRV(
		float xstrwidth,   //atom stream width
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers

		//sd constants, prefixed with c 
		//to avoid 1 letter names, cxx = sdc[0].xx
		float cem,

		//sd precalculated constants
		float pc1,  //tau_t[0] * ( 1 - sdc[0].em )
		float pc2,  //tau_t[gt] * (sdc[gt].eph - sdc.emh)
		float pc3,  //sdc[0].d/(tau_t * sdc[0].c);
		
		//sdpc = sd precalculated per atom values
		//sdpc.x = 1/sqrt(m)*sig[0].Yv
		//sdpc.y = 1/sqrt(m)*sig[0].V
		float2 sdpc<>,   
		
		//Normally distributed random vectors
		float3 fgauss[][],
	
		float3 sd2X<>,  //This is calculated in sd2 of the previous step	
		float4 posq<>,
		float3 f<>,
		float3 v<>,

		float invmass<>,
		out float3 sd1V<>, //This is V in gromacs, reused in sd2  
		out float3 vnew<>,
		out float4 posqp<>
		){
	float3 Vmh;
	float3 fg1, fg2;
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index
   float RandomValueVsp = 0.1f;

	igauss = indexof( posq );
	linind_gauss = igauss.y * xstrwidth + igauss.x;
	//2 random vectors used in each fragment
	linind_gauss = 2 * linind_gauss + goffset; 	
	
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
	igauss.x = linind_gauss - igauss.y * gstrwidth;

	//fg1 = fgauss[ igauss ];
   fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );

	linind_gauss += 1.0f;
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth);
	igauss.x = linind_gauss - igauss.y * gstrwidth;

	//fg2 = fgauss[ igauss ];
   fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
	
   /* 
      Gromacs code
    n               = 1 -> ngtc

    sdc[n].gdt      = dt/tau_t[n];
    sdc[n].eph      = exp(sdc[n].gdt/2);
    sdc[n].emh      = exp(-sdc[n].gdt/2);
    sdc[n].em       = exp(-sdc[n].gdt);

    sdc[n].b        = sdc[n].gdt*(sqr(sdc[n].eph)-1) - 4*sqr(sdc[n].eph-1);

    // 2.15 in paper (C)
    sdc[n].c        = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;

    sdc[n].d        = 2 - sdc[n].eph - sdc[n].emh;

      Vmh           = X[n-start][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c) + ism*sig[gt].Yv*fgauss(&jran);

      V[n-start][d] = ism*sig[gt].V*fgauss(&jran);

      v[n][d]       = vn*sdc[gt].em + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em) +
                      V[n-start][d] - sdc[gt].em*Vmh;
           
      xprime[n][d]  = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh); 
  
   */

// ---------------------------------------------------------------------------------------------
	// float pc3 = sdc[0].d/(tau_t * sdc[0].c);
   //sdpc.x = 1/sqrt(m)*sig[0].Yv
	Vmh  = sd2X * pc3 + sdpc.x * fg1;
// ---------------------------------------------------------------------------------------------

// ---------------------------------------------------------------------------------------------
	//sdpc.y = 1/sqrt(m)*sig[0].V
	sd1V = sdpc.y * fg2;
// ---------------------------------------------------------------------------------------------
	

// ---------------------------------------------------------------------------------------------
   // 
   // float pc1 = tau_t[0] * ( 1 - sdc[0].em )
	vnew = v * cem + invmass * f * pc1 + sd1V - Vmh * cem;
	
   // float pc2 = tau_t[gt] * (sdc[gt].eph - sdc.emh)
	posqp       = posq;
	posqp.xyz  += vnew * pc2;
}

kernel void kupdate_sd2_fix1_FixedRV(
		float xstrwidth,   //stream width of atoms
		float gstrwidth,   //stream width of random numbers
		float goffset,     //starting offset for random numbers
		
		float pc1,          //this is 1/pc2 for sd1 
		                    //= 1/(tau_t*(sdc[0].eph-sdc[0].emh))
		float pc2,          //= tau_t * sdc[0].d/(sdc[0].em - 1)
		

		//per atom precalcs
		//sdpc.x = 1/sqrt(m)*sig[0].Yx
		//sdpc.y = 1/sqrt(m)*sig[0].X
		float2 sdpc<>,

		//Normally distributed random vectors
		float3 fgauss[][],
		
		float3 sd1V<>,      //calculated in sd1
		float4 posq<>,      //positions pre-update
		float4 posqp<>,     //deltas post constraint
		float3 vnew<>,      //velocities after sd1
		out float3 sd2X<>,  //Used in sd1
		out float3 v<>,     //velocities corrected for constraints
		out float4 posqp2<> //new deltas, need to be constrained again
		){
	float linind_gauss; //linear index into the random numbers
	float2 igauss;      //2d index
	float3 fg1, fg2;
	float3 Xmh;
   float RandomValueVsp = 0.1f;

	igauss = indexof( posq );
	linind_gauss = igauss.y * xstrwidth + igauss.x;
	linind_gauss = 2 * linind_gauss + goffset; 	
	
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
	igauss.x = linind_gauss - igauss.y * gstrwidth;
	//fg1 = fgauss[ igauss ];
   fg1 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );

	linind_gauss += 1.0f;
	//igauss.y = floor( linind_gauss / gstrwidth );
   igauss.y = round( (linind_gauss - fmod( linind_gauss, gstrwidth))/gstrwidth); 
	igauss.x = linind_gauss - igauss.y * gstrwidth;
	// fg2 = fgauss[ igauss ];
   fg2 = float3( RandomValueVsp, RandomValueVsp, RandomValueVsp );
			
	//v   = pc1 * posqp.xyz (!!!!!!!! ?????? !!!!!!!)
	v   = pc1 * (posqp.xyz - posq.xyz);
	Xmh = sd1V * pc2 + sdpc.x * fg1;
	
	sd2X = sdpc.y * fg2;

	posqp2 = posqp;
	posqp2.xyz += sd2X - Xmh;
}