#include "hip/hip_runtime.h"
#include <math.h>
#include "parameters.h"
#include "cuda_commen.h"
#include "commen_kernel.h"

#include "parameters_d.h"
#include "OCFD_warp_shuffle.h"
#include "cuda_utility.h"
#include "OCFD_Schemes_hybrid_auto.h"
#include "OCFD_Schemes_Choose.h"
#include "OCFD_bound_Scheme.h"
#include "OCFD_Schemes.h"
#include "OCFD_mpi_dev.h"
#include "OCFD_mpi.h"
#include "OCFD_IO_mpi.h"

#ifdef __cplusplus 
extern "C"{
#endif

void Set_Scheme_HybridAuto(hipStream_t *stream){
    Comput_P(pd_d, pT_d, pP_d, stream);

    if(HybridAuto.Style == 1){

        Comput_grad(pP_d, stream);

        modify_NT(stream);

        if(HybridAuto.IF_Smooth_dp == 1) Smoothing_dp(stream);

        Patch_zones(stream);

        Boundary_dp(stream);

        Comput_Scheme_point(stream);

    }else if(HybridAuto.Style == 2){

        Comput_Scheme_point_Jameson(stream);

    }

}


//---------------------------------------------------Comput_P--------------------------------------------------------
__global__ void Comput_P_kernel(cudaField d, cudaField T, cudaField P, REAL p00, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(P, x, y, z) = p00 * get_Field_LAP(d, x, y, z) * get_Field_LAP(T, x, y, z);
    }
}

void Comput_P(cudaField *d, cudaField *T, cudaField *P, hipStream_t *stream){
    cudaJobPackage job(dim3(0, 0, 0) , dim3(nx+2*LAP, ny+2*LAP, nz+2*LAP));

    dim3 size, griddim, blockdim;
	jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    REAL p00 = 1.0/(Gamma*Ama*Ama);
    
    CUDA_LAUNCH((hipLaunchKernelGGL(Comput_P_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *d, *T, *P, p00, job)));
}
//--------------------------------------------------------------------------------------------------------------------


//----------------------------------------------------Comput_grad_P----------------------------------------------------
__device__ REAL warpReduce(REAL mySum){
    mySum += __shfl_xor_double(mySum, 32, hipWarpSize);
    mySum += __shfl_xor_double(mySum, 16, hipWarpSize);
    mySum += __shfl_xor_double(mySum,  8, hipWarpSize);
    mySum += __shfl_xor_double(mySum,  4, hipWarpSize);
    mySum += __shfl_xor_double(mySum,  2, hipWarpSize);
    mySum += __shfl_xor_double(mySum,  1, hipWarpSize);
    return mySum;
}

__global__ void Comput_grad1_kernel(cudaField pk, cudaField pi, cudaField ps, cudaField Akx, cudaField Aky, 
    cudaField Akz, cudaField Aix, cudaField Aiy, cudaField Aiz, cudaField Asx, cudaField Asy, cudaField Asz, 
    int SMEMDIM, cudaField grad_f, REAL *g_odata, cudaJobPackage job){
    HIP_DYNAMIC_SHARED( REAL, shared)
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int Id = blockDim.x * blockDim.y * threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;
    unsigned int warpId  = Id / hipWarpSize;
    unsigned int laneIdx = Id % hipWarpSize;
    REAL grad_f0 = 0.;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        REAL px = get_Field(pk, x, y, z) * get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP) 
                + get_Field(pi, x, y, z) * get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP)
                + get_Field(ps, x, y, z) * get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);

        REAL py = get_Field(pk, x, y, z) * get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP) 
                + get_Field(pi, x, y, z) * get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP)
                + get_Field(ps, x, y, z) * get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);

        REAL pz = get_Field(pk, x, y, z) * get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP) 
                + get_Field(pi, x, y, z) * get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP)
                + get_Field(ps, x, y, z) * get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);

        get_Field(grad_f, x, y, z) = grad_f0 = sqrt(px*px + py*py + pz*pz);
    }

    grad_f0 = warpReduce(grad_f0);

    if(laneIdx == 0) shared[warpId] = grad_f0;
    __syncthreads();

    grad_f0 = (Id < SMEMDIM)?shared[Id]:0;

    if(warpId == 0) grad_f0 = warpReduce(grad_f0);
    if(Id == 0) g_odata[blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z] = grad_f0;
}

__global__ void add_kernel(REAL *g_odata, int g_odata_size){
    HIP_DYNAMIC_SHARED( REAL, shared)
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int warpId  = threadIdx.x / hipWarpSize;
    unsigned int laneIdx = threadIdx.x % hipWarpSize;

    REAL grad_f0 = 0.;
    if(x < g_odata_size) grad_f0 = g_odata[x];

    grad_f0 = warpReduce(grad_f0);
    if(laneIdx == 0) shared[warpId] = grad_f0;
    __syncthreads();

    grad_f0 = (threadIdx.x < 8)?shared[laneIdx]:0;

    if(warpId == 0) grad_f0 = warpReduce(grad_f0);

    if(x >= gridDim.x) g_odata[x] = 0.0;

    if(threadIdx.x == 0) g_odata[blockIdx.x] = grad_f0;
}

__global__ void Comput_grad2_kernel(cudaField grad_f, REAL grad_f_av1, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field(grad_f, x, y, z) = get_Field(grad_f, x, y, z)*grad_f_av1;
    }
}

void Comput_grad(cudaField *P, hipStream_t *stream){
    cudaField Pk_d, Pi_d, Ps_d;

    Pk_d.pitch = pdu_d->pitch; Pk_d.ptr = pdu_d->ptr;
    Pi_d.pitch = pdu_d->pitch; Pi_d.ptr = pdu_d->ptr + pdu_d->pitch*ny*nz;
    Ps_d.pitch = pdu_d->pitch; Ps_d.ptr = pdu_d->ptr + 2 * pdu_d->pitch*ny*nz;
    grad_P.pitch = pdu_d->pitch; grad_P.ptr = pdu_d->ptr + 3 * pdu_d->pitch*ny*nz;

    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));

    OCFD_dx0(*P, Pk_d, job, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
    OCFD_dy0(*P, Pi_d, job, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
    OCFD_dz0(*P, Ps_d, job, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);


    dim3 size, griddim, blockdim;
    job.setup(dim3(0, 0, 0), dim3(nx, ny, nz));
	jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    REAL *g_odata;
    REAL *Sum = (REAL *)malloc(sizeof(REAL));

    unsigned int g_odata_size = griddim.x*griddim.y*griddim.z;
    CUDA_LAUNCH(( hipMalloc((REAL **)&g_odata, g_odata_size*sizeof(REAL)) ));

    int SMEMDIM = blockdim.x*blockdim.y*blockdim.z/64;   //Warpsize is 64
    CUDA_LAUNCH((hipLaunchKernelGGL(Comput_grad1_kernel, dim3(griddim), dim3(blockdim), SMEMDIM, *stream, Pk_d, Pi_d, Ps_d, *pAkx_d, *pAky_d, 
        *pAkz_d, *pAix_d, *pAiy_d, *pAiz_d, *pAsx_d, *pAsy_d, *pAsz_d, SMEMDIM, grad_P, g_odata, job)));

    dim3 blockdim_sum(512);
    dim3 griddim_sum(g_odata_size); 

    do{
        griddim_sum.x = (griddim_sum.x + blockdim_sum.x - 1)/blockdim_sum.x;
        CUDA_LAUNCH(( hipLaunchKernelGGL(add_kernel, dim3(griddim_sum), dim3(blockdim_sum), 8, *stream, g_odata, g_odata_size) ));
    } while(griddim_sum.x > 1);

    CUDA_LAUNCH(( hipMemcpy(Sum, g_odata, sizeof(REAL), hipMemcpyDeviceToHost) ));
    CUDA_LAUNCH(( hipFree(g_odata) ));

    REAL grad_f_av, grad_f_av1;

    MPI_Allreduce(Sum, &grad_f_av, 1, OCFD_DATA_TYPE, MPI_SUM, MPI_COMM_WORLD);

    grad_f_av = grad_f_av/(NX_GLOBAL * NY_GLOBAL * NZ_GLOBAL);
    grad_f_av1 = 1.0/grad_f_av;

    CUDA_LAUNCH((hipLaunchKernelGGL(Comput_grad2_kernel, dim3(griddim), dim3(blockdim), 0, *stream, grad_P, grad_f_av1, job)));
}
//----------------------------------------------------------------------------------------------------------

//---------------------------------------------Modify Negative T--------------------------------------------
__global__ void ana_NT_kernel(cudaField T, cudaField grad_f, REAL P_intvs, cudaJobPackage job){
    // field with LAP
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
	
	if(x < job.end.x && y < job.end.y && z < job.end.z){
        REAL t;
        t = get_Field_LAP(T , x,y,z);
        if(t < 0){
            t = get_Field_LAP(T , x-1 , y , z) + get_Field_LAP(T , x+1 , y , z)
               +get_Field_LAP(T , x , y-1 , z) + get_Field_LAP(T , x , y+1 , z)
               +get_Field_LAP(T , x , y , z-1) + get_Field_LAP(T , x , y , z+1);
            get_Field_LAP(T , x,y,z) = t/6.0;

            get_Field(grad_f, x, y, z) = fmax(10.*get_Field_LAP(grad_f, x, y, z), P_intvs + 1.);
        }
	}
}


void modify_NT(hipStream_t *stream){
    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );
    cudaJobPackage job(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));
    hipLaunchKernelGGL(ana_NT_kernel, dim3(griddim ), dim3(blockdim), 0, *stream, *pT_d, grad_P,  HybridAuto.P_intvs[1], job);
}


//----------------------------------------------Smoothing_dp------------------------------------------------
__global__ void Modify_P_kernel(cudaField f, cudaField grad_f, REAL P_intvs, cudaJobPackage job){
    unsigned int x = (blockDim.x * blockIdx.x + threadIdx.x) + job.start.x;
	unsigned int y = (blockDim.y * blockIdx.y + threadIdx.y) + job.start.y;
    unsigned int z = (blockDim.z * blockIdx.z + threadIdx.z) + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        REAL ff;
        get_Field_LAP(f, x, y, z) = ff = get_Field(grad_f, x-LAP, y-LAP, z-LAP);
        if(ff >= P_intvs) get_Field_LAP(f, x, y, z) = 3*ff;
    }
}

__global__ void Modify_grad_inner_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field(grad_f, x-LAP, y-LAP, z-LAP) = get_Field_LAP(f, x, y, z)/3.0 
                            + (get_Field_LAP(f, x+1, y, z) + get_Field_LAP(f, x-1, y, z)
                            +  get_Field_LAP(f, x, y+1, z) + get_Field_LAP(f, x, y-1, z)
                            +  get_Field_LAP(f, x, y, z+1) + get_Field_LAP(f, x, y, z-1))/9.0;
    }
}

__global__ void Modify_grad_outer_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field(grad_f, x, y, z) = get_Field_LAP(f, x+LAP, y+LAP, z+LAP); 
    }
}

void Smoothing_dp(hipStream_t *stream){
    REAL P_intvs = HybridAuto.P_intvs[HybridA_Stage - 1];

    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
    dim3 size, griddim, blockdim;

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
    CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_P_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, P_intvs, job) ));

    exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream);

    CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_inner_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job) ))

	//---------------------------------------------------------------------------------------------
	if (npx == 0 && Iperiodic[0] == 0)
	{	
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(0, 0, 0), dim3(1, ny, nz));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job_outer) ))
	}
	if (npx == NPX0 - 1 && Iperiodic[0] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(nx-1, 0, 0), dim3(nx, ny, nz));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job_outer) ))
	}
	//---------------------------------------------------------------------------------------------
	if (npy == 0 && Iperiodic[1] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(0, 0, 0), dim3(nx, 1, nz));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job_outer) ))
	}

	if (npy == NPY0 - 1 && Iperiodic[1] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(0, ny-1, 0), dim3(nx, ny, nz));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job_outer) ))
	}
	//----------------------------------------------------------------------------------------------
	if (npz == 0 && Iperiodic[2] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(0, 0, 0), dim3(nx, ny, 1));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job_outer) ))
	}

	if (npz == NPZ0 - 1 && Iperiodic[2] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(0, 0, nz-1), dim3(nx, ny, nz));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_grad_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job_outer) ))
	}
}
//-------------------------------------------------------------------------------------------------------------------


//----------------------------------------------------Patch_zones----------------------------------------------------
__global__ void Patch_zones_kernel(cudaField grad_f, REAL Pa_zones, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field(grad_f, x, y, z) = Pa_zones;
    }
}

void Patch_zones(hipStream_t *stream){
    int node_ib, node_ie, node_jb, node_je, node_kb, node_ke;
    int ib, ie, jb, je, kb, ke;

    for(int i = 0; i < HybridAuto.Num_Patch_zones; i++){
        int (*HybridAuto_zones)[6] = (int(*)[6])HybridAuto.zones;
        int flag_i = 1, flag_j = 1, flag_k = 1;

        get_i_node(HybridAuto_zones[i][0], &node_ib, &ib);
        get_i_node(HybridAuto_zones[i][1], &node_ie, &ie);
        get_j_node(HybridAuto_zones[i][2], &node_jb, &jb);
        get_j_node(HybridAuto_zones[i][3], &node_je, &je);
        get_k_node(HybridAuto_zones[i][4], &node_kb, &kb);
        get_k_node(HybridAuto_zones[i][5], &node_ke, &ke);

        if(node_ib < npx) ib = 0;
        if(node_ib > npx) flag_i = 0;
        if(node_ie > npx) ie = nx;
        if(node_ie < npx) flag_i = 0;

        if(node_jb < npy) jb = 0;
        if(node_jb > npy) flag_j = 0;
        if(node_je > npy) je = ny;
        if(node_je < npy) flag_j = 0;

        if(node_kb < npz) kb = 0;
        if(node_kb > npz) flag_k = 0;
        if(node_ke > npz) ke = nz;
        if(node_ke < npz) flag_k = 0;

        if(flag_i*flag_j*flag_k != 0){
            cudaJobPackage job(dim3(ib, jb, kb) , dim3(ie, je, ke));

            dim3 size, griddim, blockdim;
            jobsize(&job, &size);
            cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

            REAL Pa_zones = HybridAuto.Pa_zones[i];
            CUDA_LAUNCH(( hipLaunchKernelGGL(Patch_zones_kernel, dim3(griddim), dim3(blockdim), 0, *stream, grad_P, Pa_zones, job) ));
        }
    }
}
//---------------------------------------------------------------------------------------------------------------


//----------------------------------------------Boundary_dp-----------------------------------------------------
__global__ void Modify_P_all_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + f.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        get_Field_LAP(f, x, y, z, offset) = get_Field(grad_f, x, y, z);
    }
}

__global__ void Modify_x_P_outer_kernel(cudaField f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(f, x-1, y, z) = get_Field_LAP(f, x, y, z); 
    }
}

__global__ void Modify_x_M_outer_kernel(cudaField f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(f, x+1, y, z) = get_Field_LAP(f, x, y, z); 
    }
}

__global__ void Modify_y_P_outer_kernel(cudaField f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(f, x, y-1, z) = get_Field_LAP(f, x, y, z); 
    }
}

__global__ void Modify_y_M_outer_kernel(cudaField f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(f, x, y+1, z) = get_Field_LAP(f, x, y, z); 
    }
}

__global__ void Modify_z_P_outer_kernel(cudaField f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(f, x, y, z-1) = get_Field_LAP(f, x, y, z); 
    }
}

__global__ void Modify_z_M_outer_kernel(cudaField f, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        get_Field_LAP(f, x, y, z+1) = get_Field_LAP(f, x, y, z); 
    }
}

void Boundary_dp(hipStream_t *stream){
    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
    dim3 size, griddim, blockdim;

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
    CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_P_all_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, grad_P, job) ));

    exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream);

    //---------------------------------------------------------------------------------------------
	if (npx == 0 && Iperiodic[0] == 0)
	{	
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(LAP, LAP, LAP), dim3(LAP+1, ny+LAP, nz+LAP));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_x_P_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, job_outer) ))
	}
	if (npx == NPX0 - 1 && Iperiodic[0] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(nx+LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_x_M_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, job_outer) ))
	}
	//---------------------------------------------------------------------------------------------
	if (npy == 0 && Iperiodic[1] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(LAP, LAP, LAP), dim3(nx+LAP, LAP+1, nz+LAP));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_y_P_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, job_outer) ))
	}

	if (npy == NPY0 - 1 && Iperiodic[1] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(LAP, ny+LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_y_M_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, job_outer) ))
	}
	//----------------------------------------------------------------------------------------------
	if (npz == 0 && Iperiodic[2] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(LAP, LAP, LAP), dim3(nx+LAP, ny+LAP, LAP+1));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_z_P_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, job_outer) ))
	}

	if (npz == NPZ0 - 1 && Iperiodic[2] == 0)
	{
		dim3 griddim, blockdim;
		cudaJobPackage job_outer(dim3(LAP, LAP, nz+LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));

		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
        CUDA_LAUNCH(( hipLaunchKernelGGL(Modify_z_M_outer_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, job_outer) ))
	}
}
//---------------------------------------------------------------------------------------------------------------------


//-----------------------------------------------Comput_Scheme_point---------------------------------------------------
__global__ void Comput_Scheme_point_x_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x+1, y, z, offset));
        int kp = 1;
        if(dp0 > P_intvs1) kp += 1;
        if(dp0 > P_intvs2) kp += 1;
        *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
    }
}

__global__ void Comput_Scheme_point_y_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y+1, z, offset));
        int kp = 1;
        if(dp0 > P_intvs1) kp += 1;
        if(dp0 > P_intvs2) kp += 1;
        *(scheme.ptr + (x + scheme.pitch *(y + (z)*(ny_d + 1)))) = kp;
    }
}

__global__ void Comput_Scheme_point_z_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y, z+1, offset));
        int kp = 1;
        if(dp0 > P_intvs1) kp += 1;
        if(dp0 > P_intvs2) kp += 1;
        *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
    }
}

void Comput_Scheme_point(hipStream_t *stream){

    dim3 size, griddim, blockdim;
    cudaJobPackage job(dim3(LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));

	jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
    
    CUDA_LAUNCH(( hipLaunchKernelGGL(Comput_Scheme_point_x_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, *HybridAuto.scheme_x, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    job.setup(dim3(LAP, LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    CUDA_LAUNCH(( hipLaunchKernelGGL(Comput_Scheme_point_y_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, *HybridAuto.scheme_y, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    job.setup(dim3(LAP, LAP, LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    CUDA_LAUNCH(( hipLaunchKernelGGL(Comput_Scheme_point_z_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pPP_d, *HybridAuto.scheme_z, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3
}


__global__ void Comput_Scheme_point_x_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        REAL dp0 = fabs(-get_Field_LAP(P, x-1, y, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x+1, y, z, offset))/
        (get_Field_LAP(P, x-1, y, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x+1, y, z, offset));

        dp0 += fabs(-get_Field_LAP(P, x, y-1, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x, y+1, z, offset))/
        (get_Field_LAP(P, x, y-1, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y+1, z, offset));

        dp0 += fabs(-get_Field_LAP(P, x, y, z-1, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x, y, z+1, offset))/
        (get_Field_LAP(P, x, y, z-1, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y, z+1, offset));

        int kp = 1;
        if(dp0 > P_intvs1) kp += 1;
        if(dp0 > P_intvs2) kp += 1;
        *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
    }
}

__global__ void Comput_Scheme_point_y_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        REAL dp0 = fabs(-get_Field_LAP(P, x-1, y, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x+1, y, z, offset))/
        (get_Field_LAP(P, x-1, y, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x+1, y, z, offset));

        dp0 += fabs(-get_Field_LAP(P, x, y-1, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x, y+1, z, offset))/
        (get_Field_LAP(P, x, y-1, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y+1, z, offset));

        dp0 += fabs(-get_Field_LAP(P, x, y, z-1, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x, y, z+1, offset))/
        (get_Field_LAP(P, x, y, z-1, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y, z+1, offset));

        int kp = 1;
        if(dp0 > P_intvs1) kp += 1;
        if(dp0 > P_intvs2) kp += 1;
        *(scheme.ptr + (x + scheme.pitch *(y + (z)*(ny_d + 1)))) = kp;
    }
}

__global__ void Comput_Scheme_point_z_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
    
    if(x < (job.end.x - job.start.x) && y < (job.end.y - job.start.y) && z < (job.end.z - job.start.z)){
        REAL dp0 = fabs(-get_Field_LAP(P, x-1, y, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x+1, y, z, offset))/
        (get_Field_LAP(P, x-1, y, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x+1, y, z, offset));

        dp0 += fabs(-get_Field_LAP(P, x, y-1, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x, y+1, z, offset))/
        (get_Field_LAP(P, x, y-1, z, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y+1, z, offset));

        dp0 += fabs(-get_Field_LAP(P, x, y, z-1, offset) + 2*get_Field_LAP(P, x, y, z, offset) - get_Field_LAP(P, x, y, z+1, offset))/
        (get_Field_LAP(P, x, y, z-1, offset) + 2*get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y, z+1, offset));

        int kp = 1;
        if(dp0 > P_intvs1) kp += 1;
        if(dp0 > P_intvs2) kp += 1;
        *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
    }
}

void Comput_Scheme_point_Jameson(hipStream_t *stream){

    dim3 size, griddim, blockdim;
    cudaJobPackage job(dim3(LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));

	jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
    
    CUDA_LAUNCH(( hipLaunchKernelGGL(Comput_Scheme_point_x_Jameson_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pP_d, *HybridAuto.scheme_x, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    job.setup(dim3(LAP, LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    CUDA_LAUNCH(( hipLaunchKernelGGL(Comput_Scheme_point_y_Jameson_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pP_d, *HybridAuto.scheme_y, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    job.setup(dim3(LAP, LAP, LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    CUDA_LAUNCH(( hipLaunchKernelGGL(Comput_Scheme_point_z_Jameson_kernel, dim3(griddim), dim3(blockdim), 0, *stream, *pP_d, *HybridAuto.scheme_z, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3
}


void HybridAuto_scheme_IO(){
    memcpy_All_int(scheme_x, HybridAuto.scheme_x->ptr, HybridAuto.scheme_x->pitch, D2H, nx+1, ny, nz);
    memcpy_All_int(scheme_y, HybridAuto.scheme_y->ptr, HybridAuto.scheme_y->pitch, D2H, nx, ny+1, nz);
    memcpy_All_int(scheme_z, HybridAuto.scheme_z->ptr, HybridAuto.scheme_z->pitch, D2H, nx, ny, nz+1);
    memcpy_All(pP, pPP_d->ptr , pPP_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    FILE *fp; 
    char fp_name[120];

    if(my_id == 0){
        sprintf(fp_name, "Scheme_x%08d.dat", Istep);
        fp = fopen(fp_name, "w");
    }
    write_2d_XY(fp, NZ_GLOBAL/2, nx+1, ny, 0, scheme_x, pP);

    if(my_id == 0){
        fclose(fp);
        sprintf(fp_name, "Scheme_y%08d.dat", Istep);
        fp = fopen(fp_name, "w");
    }
    write_2d_XY(fp, NZ_GLOBAL/2, nx, ny+1, 0, scheme_y, pP);
    
    if(my_id == 0){
        fclose(fp);
        sprintf(fp_name, "Scheme_z%08d.dat", Istep);
        fp = fopen(fp_name, "w");
    }
    write_2d_XY(fp, NZ_GLOBAL/2, nx, ny, 0, scheme_z, pP);
    
    if(my_id == 0) fclose(fp);
}


void HybridAuto_scheme_Proportion(){
    memcpy_All_int(scheme_x, HybridAuto.scheme_x->ptr, HybridAuto.scheme_x->pitch, D2H, nx+1, ny, nz);
    memcpy_All_int(scheme_y, HybridAuto.scheme_y->ptr, HybridAuto.scheme_y->pitch, D2H, nx, ny+1, nz);
    memcpy_All_int(scheme_z, HybridAuto.scheme_z->ptr, HybridAuto.scheme_z->pitch, D2H, nx, ny, nz+1);

    double type1 = 0.0, type2 = 0.0, type3 = 0.0;
    double Sum_type1, Sum_type2, Sum_type3;

    int tmp = (nx + 1) * ny * nz;
    for(int i = 0; i < tmp; i++){
        if(*(scheme_x + i) == 1){ 
            type1 += 1.0;
        }else if(*(scheme_x + i) == 2){
            type2 += 1.0;
        }else{
            type3 += 1.0;
        }
    }

    type1 /= NY_GLOBAL*NZ_GLOBAL;
    type2 /= NY_GLOBAL*NZ_GLOBAL;
    type3 /= NY_GLOBAL*NZ_GLOBAL;

    MPI_Reduce(&type1, &Sum_type1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&type2, &Sum_type2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&type3, &Sum_type3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    tmp = NX_GLOBAL + NPX0;

    char scheme_percent[] = "The first type scheme of Hybrid schemes in direction %s is \033[34m%lf%\033[0m, second is"
 "\033[34m%lf%\033[0m， third is \033[34m%lf%\033[0m\n";

    if(my_id == 0) printf(scheme_percent, "X", Sum_type1/tmp, Sum_type2/tmp, Sum_type3/tmp);

    type1 = 0.0; type2 = 0.0; type3 = 0.0;

    tmp = nx * (ny + 1) * nz;
    for(int i = 0; i < tmp; i++){
        if(*(scheme_y + i) == 1){ 
            type1 += 1.0;
        }else if(*(scheme_y + i) == 2){
            type2 += 1.0;
        }else{
            type3 += 1.0;
        }
    }

    type1 /= NX_GLOBAL*NZ_GLOBAL;
    type2 /= NX_GLOBAL*NZ_GLOBAL;
    type3 /= NX_GLOBAL*NZ_GLOBAL;

    MPI_Reduce(&type1, &Sum_type1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&type2, &Sum_type2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&type3, &Sum_type3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    tmp = (NY_GLOBAL + NPY0);

    if(my_id == 0) printf(scheme_percent, "Y", Sum_type1/tmp, Sum_type2/tmp, Sum_type3/tmp);

    type1 = 0.0; type2 = 0.0; type3 = 0.0;

    tmp = nx * ny * (nz + 1);
    for(int i = 0; i < tmp; i++){
        if(*(scheme_z + i) == 1){ 
            type1 += 1.0;
        }else if(*(scheme_z + i) == 2){
            type2 += 1.0;
        }else{
            type3 += 1.0;
        }
    }

    type1 /= NX_GLOBAL*NY_GLOBAL;
    type2 /= NX_GLOBAL*NY_GLOBAL;
    type3 /= NX_GLOBAL*NY_GLOBAL;

    MPI_Reduce(&type1, &Sum_type1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&type2, &Sum_type2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&type3, &Sum_type3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    tmp = (NZ_GLOBAL + NPZ0);

    if(my_id == 0) printf(scheme_percent, "Z", Sum_type1/tmp, Sum_type2/tmp, Sum_type3/tmp);

}


__device__ int get_Hyscheme_flag_p_kernel(int flagxyz, dim3 coords, cudaField_int scheme, cudaJobPackage job){
	unsigned int x = coords.x + job.start.x;
	unsigned int y = coords.y + job.start.y;
	unsigned int z = coords.z + job.start.z;
    int Hyscheme_flag;

	switch(flagxyz){
		case 1:
		case 4:
        Hyscheme_flag = *(scheme.ptr + (x + 1 - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
		return Hyscheme_flag;
		break;

	    case 2:
	    case 5:
        Hyscheme_flag = *(scheme.ptr + (x - LAP + scheme.pitch *(y + 1 - LAP + (z - LAP)*(ny_d + 1))));
	    return Hyscheme_flag;
	    break;

		case 3:
		case 6:
        Hyscheme_flag = *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z + 1 - LAP)*ny_d)));
		return Hyscheme_flag;
		break;
	}

    return 0;
}


__global__ void OCFD_HybridAuto_P_kernel(dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    HIP_DYNAMIC_SHARED( REAL, sort)
	dim3 coords;
	REAL stencil[8];
    int Hyscheme_flag;
	int ia1 = -3; int ib1 = 4;
	

    for(int i = 0; i < 5; i++){

		int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);

		if(flag != 0){
			REAL tmp_r, tmp_l;

            if(i == 0) Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);

            flag = OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  

			if(flag != 0){
                if(Hyscheme_flag == 1){
                    tmp_r = OCFD_OMP6_kernel_P(0, &stencil[0]);
                }else if(Hyscheme_flag == 2){
                    tmp_r = OCFD_weno7_kernel_P(&stencil[0]);
                }else{
                    tmp_r = OCFD_NND2_kernel_P(&stencil[2]);
                }
            }

			tmp_l = __shfl_up_double(tmp_r, 1, hipWarpSize);

            if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
		}
	}

}


__global__ void OCFD_HybridAuto_P_Jameson_kernel(dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    HIP_DYNAMIC_SHARED( REAL, sort)
	dim3 coords;
	REAL stencil[8];
    int Hyscheme_flag;
	int ia1 = -3; int ib1 = 4;
	

    for(int i = 0; i < 5; i++){

		int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);

		if(flag != 0){
			REAL tmp_r, tmp_l;

            if(i == 0) Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);

            flag = OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  

			if(flag != 0){
                if(Hyscheme_flag == 1){
                    tmp_r = OCFD_UP7_kernel_P(&stencil[0]);
                }else if(Hyscheme_flag == 2){
                    tmp_r = OCFD_weno7_kernel_P(&stencil[0]);
                }else{
                    tmp_r = OCFD_weno5_kernel_P(&stencil[1]);
                }
            }

			tmp_l = __shfl_up_double(tmp_r, 1, hipWarpSize);

            if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
		}
	}

}


__device__ int get_Hyscheme_flag_m_kernel(int flagxyz, dim3 coords, cudaField_int scheme, cudaJobPackage job){
	unsigned int x = coords.x + job.start.x;
	unsigned int y = coords.y + job.start.y;
	unsigned int z = coords.z + job.start.z;
    int Hyscheme_flag;

	switch(flagxyz){
		case 1:
		case 4:
        Hyscheme_flag = *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
		return Hyscheme_flag;
        break;

	    case 2:
	    case 5:
        Hyscheme_flag = *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z - LAP)*(ny_d + 1))));
	    return Hyscheme_flag;
	    break;

		case 3:
		case 6:
        Hyscheme_flag = *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
		return Hyscheme_flag;
		break;
	}

    return 0;
}


__global__ void OCFD_HybridAuto_M_kernel(dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    HIP_DYNAMIC_SHARED( REAL, sort)
	dim3 coords;
	REAL stencil[8];
    int Hyscheme_flag;

	int ia1 = -4; int ib1 = 3;


    for(int i = 0; i < 5; i++){

        int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
    
        if(flag != 0){
            REAL tmp_r, tmp_l; 

            if(i == 0) Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);
    
            flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  
    
            if(flag != 0){
                if(Hyscheme_flag == 1){
                    tmp_r = OCFD_OMP6_kernel_M(0, &stencil[0]);
                }else if(Hyscheme_flag == 2){
                    tmp_r = OCFD_weno7_kernel_M(&stencil[1]);
                }else{
                    tmp_r = OCFD_NND2_kernel_M(&stencil[3]);
                }
            }
    
            tmp_l = __shfl_up_double(tmp_r, 1, hipWarpSize);
    
            if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
        }
    }
}


__global__ void OCFD_HybridAuto_M_Jameson_kernel(dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    HIP_DYNAMIC_SHARED( REAL, sort)
	dim3 coords;
	REAL stencil[8];
    int Hyscheme_flag;

	int ia1 = -4; int ib1 = 3;


    for(int i = 0; i < 5; i++){

        int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
    
        if(flag != 0){
            REAL tmp_r, tmp_l; 

            if(i == 0) Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);
    
            flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  
    
            if(flag != 0){
                if(Hyscheme_flag == 1){
                    tmp_r = OCFD_UP7_kernel_M(&stencil[0]);
                }else if(Hyscheme_flag == 2){
                    tmp_r = OCFD_weno7_kernel_M(&stencil[1]);
                }else{
                    tmp_r = OCFD_weno5_kernel_M(&stencil[2]);
                }
            }
    
            tmp_l = __shfl_up_double(tmp_r, 1, hipWarpSize);
    
            if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
        }
    }
}


#ifdef __cplusplus
}
#endif