first commit

c0b0318b · ccfd · c0b0318b · c0b0318b · c0b0318b · c0b0318b
Commit c0b0318b authored Jul 21, 2022 by ccfd
20 changed files
--- a/src/OCFD_boundary_compression_conner.cu
+++ b/src/OCFD_boundary_compression_conner.cu
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "parameters.h"
+#include "parameters_d.h"
+#include "utility.h"
+#include "io_warp.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+
+#include "OCFD_boundary_init.h"
+#include "OCFD_boundary_compression_conner.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+extern int x_begin;        // defined in OCFD_boundary_init.c; x offset at which the top-boundary job below starts
+extern cudaField *pub1_d;  // device copy of the inlet profile read by the inlet/top kernels (rows spaced by pitch)
+extern cudaField *pfx_d;   // device copy of the x-shape of the wall disturbance
+extern cudaField *pgz_d;   // device copy of the z-shape of the wall disturbance
+extern REAL *fait;         // host array filled in get_fait_multifrequancy; only referenced in commented-out code in this file
+extern REAL *TM;           // host array of per-mode time weights used to build ht below
+
+
+/*
+ * Inlet boundary kernel: for every (y, z) inside the job range, copies the
+ * 1-D inlet profile stored in ub1 into the x planes 0..LAP (inclusive) of
+ * d, u, v and T, and sets w to zero there.
+ *
+ * ub1 is read as four rows spaced by ub1.pitch (row 0 -> d, 1 -> u, 2 -> v,
+ * 3 -> T), each indexed by y - LAP.
+ * Launch layout: only the y and z thread/block indices are used.
+ */
+__global__ void do_ub1_inlet_kernel(cudaField d, cudaField u, cudaField v, cudaField w, cudaField T, cudaField ub1, cudaJobPackage job){
+    unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+    unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    // threads outside the job range have nothing to do
+    if(y >= job.end.y || z >= job.end.z) return;
+
+    // position inside the profile (profile carries no halo points)
+    unsigned int row = y - LAP;
+
+    int plane = 0;
+    while(plane <= LAP){
+        get_Field_LAP(d, plane, y, z) = *(ub1.ptr + row);
+        get_Field_LAP(u, plane, y, z) = *(ub1.ptr + row + ub1.pitch * 1);
+        get_Field_LAP(v, plane, y, z) = *(ub1.ptr + row + ub1.pitch * 2);
+        get_Field_LAP(w, plane, y, z) = 0.;
+        get_Field_LAP(T, plane, y, z) = *(ub1.ptr + row + ub1.pitch * 3);
+        plane++;
+    }
+}
+
+
+/*
+ * Top boundary kernel: for every (x, z) inside the job range, writes the last
+ * entry (index ny_d - 1) of each ub1 profile row into the y plane
+ * ny_lap_d - 1 of d, u, v and T, and sets w to zero there.
+ *
+ * Launch layout: only the x and z thread/block indices are used.
+ */
+__global__ void do_ub1_top_kernel(cudaField d, cudaField u, cudaField v, cudaField w, cudaField T, cudaField ub1, cudaJobPackage job){
+    unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    // threads outside the job range have nothing to do
+    if(x >= job.end.x || z >= job.end.z) return;
+
+    get_Field_LAP(d, x, ny_lap_d - 1, z) = *(ub1.ptr + ny_d - 1);
+    get_Field_LAP(u, x, ny_lap_d - 1, z) = *(ub1.ptr + ub1.pitch * 1 + ny_d - 1);
+    get_Field_LAP(v, x, ny_lap_d - 1, z) = *(ub1.ptr + ub1.pitch * 2 + ny_d - 1);
+    get_Field_LAP(T, x, ny_lap_d - 1, z) = *(ub1.ptr + ub1.pitch * 3 + ny_d - 1);
+    get_Field_LAP(w, x, ny_lap_d - 1, z) = 0.;
+}
+
+
+/*
+ * Wall disturbance kernel: on the y plane LAP, sets u and w to zero and
+ * v to epsl * fx[x - LAP] * gz[z - LAP] * HT for every (x, z) in the job range.
+ *
+ * d and T are accepted but not touched by this kernel.
+ * Launch layout: only the x and z thread/block indices are used.
+ */
+__global__ void do_wall_dist_kernel(cudaField d, cudaField u, cudaField v, cudaField w, cudaField T, 
+REAL HT, REAL epsl, cudaField fx, cudaField gz, cudaJobPackage job){
+    unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    // threads outside the job range have nothing to do
+    if(x >= job.end.x || z >= job.end.z) return;
+
+    // fx and gz are indexed without the halo offset
+    unsigned int ix = x - LAP;
+    unsigned int iz = z - LAP;
+
+    // tangential components vanish on the wall plane
+    get_Field_LAP(u, x, LAP, z) = 0.;
+    get_Field_LAP(w, x, LAP, z) = 0.;
+
+    // wall-normal component carries the disturbance
+    get_Field_LAP(v, x, LAP, z) = epsl * (*(fx.ptr + ix)) * (*(gz.ptr + iz)) * HT;
+}
+
+
+/*
+ * Wall kernel for a prescribed wall temperature tw.
+ *
+ * For every (x, z) in the job range:
+ *   1. builds pw from the products d*T on the planes LAP+1, LAP+2, LAP+3
+ *      with the weights (18, -9, 2)/11,
+ *   2. sets T = tw and d = pw / tw on the plane LAP,
+ *   3. fills the planes below LAP by mirroring about LAP: d and T are copied,
+ *      u, v and w are copied with the sign flipped.
+ *
+ * Launch layout: only the x and z thread/block indices are used.
+ */
+__global__ void do_wall_Tp1_kernel(cudaField d, cudaField T, cudaField u, cudaField v, cudaField w, REAL tw, cudaJobPackage job){
+    unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    // threads outside the job range have nothing to do
+    if(x >= job.end.x || z >= job.end.z) return;
+
+    // d*T extrapolated onto the wall plane from the three planes above it
+    REAL pw = (18. * get_Field_LAP(d, x, LAP + 1, z) * get_Field_LAP(T, x, LAP + 1, z)
+              - 9. * get_Field_LAP(d, x, LAP + 2, z) * get_Field_LAP(T, x, LAP + 2, z)
+              + 2. * get_Field_LAP(d, x, LAP + 3, z) * get_Field_LAP(T, x, LAP + 3, z))/ 11.;
+
+    get_Field_LAP(T, x, LAP, z) = tw;
+    get_Field_LAP(d, x, LAP, z) = pw / get_Field_LAP(T, x, LAP, z);
+
+    // ghost plane (LAP - off) mirrors interior plane (LAP + off)
+    for(int off = LAP; off >= 1; off--){
+        int ghost  = LAP - off;
+        int mirror = LAP + off;
+
+        get_Field_LAP(d, x, ghost, z) =  get_Field_LAP(d, x, mirror, z);
+        get_Field_LAP(T, x, ghost, z) =  get_Field_LAP(T, x, mirror, z);
+
+        get_Field_LAP(u, x, ghost, z) = -get_Field_LAP(u, x, mirror, z);
+        get_Field_LAP(v, x, ghost, z) = -get_Field_LAP(v, x, mirror, z);
+        get_Field_LAP(w, x, ghost, z) = -get_Field_LAP(w, x, mirror, z);
+    }
+}
+
+
+/*
+ * Wall kernel used when no wall temperature is prescribed.
+ *
+ * For every (x, z) in the job range, T on the plane LAP is built from T on
+ * the planes LAP+1..LAP+3 with the weights (18, -9, 2)/11; the same weights
+ * applied to d*T give pw, and d on the plane LAP becomes pw / T(LAP).
+ *
+ * Launch layout: only the x and z thread/block indices are used.
+ */
+__global__ void do_wall_Tp2_kernel(cudaField d, cudaField T, cudaJobPackage job){
+    unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    // threads outside the job range have nothing to do
+    if(x >= job.end.x || z >= job.end.z) return;
+
+    // wall temperature from the three planes above the wall
+    get_Field_LAP(T, x, LAP, z) = (18. * get_Field_LAP(T, x, LAP + 1, z)
+                                   -9. * get_Field_LAP(T, x, LAP + 2, z)
+                                   +2. * get_Field_LAP(T, x, LAP + 3, z)) / 11.;
+
+    // d*T extrapolated with the same weights
+    REAL pw = (18. * get_Field_LAP(d, x, LAP + 1, z) * get_Field_LAP(T, x, LAP + 1, z)
+              - 9. * get_Field_LAP(d, x, LAP + 2, z) * get_Field_LAP(T, x, LAP + 2, z)
+              + 2. * get_Field_LAP(d, x, LAP + 3, z) * get_Field_LAP(T, x, LAP + 3, z)) / 11.;
+
+    get_Field_LAP(d, x, LAP, z) = pw / get_Field_LAP(T, x, LAP, z);
+}
+
+
+/*
+ * Host driver for the compression-corner boundary conditions.
+ *
+ * Depending on the position of this rank in the process grid it launches:
+ *   - npx == 0        : the inlet kernel,
+ *   - npy == NPY0 - 1 : the top-boundary kernel, starting at x_begin,
+ *   - npy == 0        : the wall disturbance kernel followed by one of the
+ *                       two wall temperature/density kernels (selected by TW).
+ * The time factor ht handed to the disturbance kernel is the sum of
+ * TM[m] * sin((m + 1) * BETA * tt) when BETA > 0, and 1 otherwise.
+ */
+void bc_user_Compression_conner(){
+
+//---------------------boundary condition at i=1 -------------------------------//
+    if(npx == 0){
+        dim3 inlet_block , inlet_grid;
+        cal_grid_block_dim(&inlet_grid, &inlet_block, 1, BlockDimY, BlockDimZ, 1, ny, nz);
+        cudaJobPackage inlet_job( dim3(LAP, LAP ,LAP) , dim3(LAP+1, ny_lap, nz_lap) );
+        CUDA_LAUNCH(( do_ub1_inlet_kernel<<<inlet_grid, inlet_block>>>(*pd_d, *pu_d, *pv_d, *pw_d, *pT_d, *pub1_d, inlet_job) ));
+    }
+
+//---------------------boundary condition at the last j process row ------------//
+    if(npy == NPY0 - 1){
+        int x_do = nx - x_begin;
+        dim3 top_block , top_grid;
+        cal_grid_block_dim(&top_grid, &top_block, BlockDimX, 1, BlockDimZ, x_do, 1, nz);
+        cudaJobPackage top_job( dim3(x_begin + LAP, LAP, LAP) , dim3(nx_lap, LAP+1, nz_lap) );
+        CUDA_LAUNCH(( do_ub1_top_kernel<<<top_grid, top_block>>>(*pd_d, *pu_d, *pv_d, *pw_d, *pT_d, *pub1_d, top_job) ));
+    }
+
+    // time factor of the wall disturbance
+    REAL ht = 0.;
+    if(BETA > 0.){
+        int m = 0;
+        while(m < MTMAX){
+            ht = ht + TM[m] * sin((m + 1)*BETA*tt);
+            m++;
+        }
+    }else{
+        ht = 1.;
+    }
+
+//---------------------wall-boundary-condition-at-j=1---------------------------//
+    if(npy != 0) return;
+
+    dim3 dist_block , dist_grid;
+    cal_grid_block_dim(&dist_grid, &dist_block, BlockDimX, 1, BlockDimZ, nx, 1, nz);
+    cudaJobPackage dist_job( dim3(LAP, LAP, LAP) , dim3(nx_lap, LAP+1, nz_lap) );
+    CUDA_LAUNCH(( do_wall_dist_kernel<<<dist_grid, dist_block>>>(*pd_d, *pu_d, *pv_d, *pw_d, *pT_d, ht, EPSL, *pfx_d, *pgz_d, dist_job) ));
+
+    // only wall-normal meshes are handled
+    if(IFLAG_WALL_NOT_NORMAL != 0){
+        printf("Now non-normal wall is not supported\n");
+        return;
+    }
+
+    dim3 wall_block , wall_grid;
+    cal_grid_block_dim(&wall_grid, &wall_block, BlockDimX, 1, BlockDimZ, nx, 1, nz);
+    cudaJobPackage wall_job( dim3(LAP, LAP, LAP) , dim3(nx_lap, LAP+1, nz_lap) );
+
+    if(TW > 0){
+        // prescribed wall temperature
+        CUDA_LAUNCH(( do_wall_Tp1_kernel<<<wall_grid, wall_block>>>(*pd_d, *pT_d, *pu_d, *pv_d, *pw_d, TW, wall_job) ));
+    }else{
+        // wall temperature taken from the interior
+        CUDA_LAUNCH(( do_wall_Tp2_kernel<<<wall_grid, wall_block>>>(*pd_d, *pT_d, wall_job) ));
+    }
+}
+
+
+//void get_ht_multifrequancy(REAL HT, REAL TT, int MT_MAX, REAL beta){
+//    HT = 0.;
+//
+//    if(beta > 0.){
+//        for(int m = 0; m < MT_MAX; m++){
+//            HT = HT + TM[m] * sin((m + 1)*beta*TT + 2.*PI*fait[m]);
+//        }
+//    }else{
+//        HT = 1.;
+//    }
+//
+//}
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_boundary_init.c
+++ b/src/OCFD_boundary_init.c
+#include "math.h"
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_IO.h"
+#include "io_warp.h"
+#include "OCFD_init.h"
+#include "OCFD_boundary_Liftbody3D.h"
+#include "OCFD_Comput_Jacobian3d.h"
+#include "OCFD_boundary_init.h"
+#include "OCFD_init.h"
+
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "commen_kernel.h"
+#include "time.h"
+#include "mpi.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// used by the lift-body boundary condition (bc_user_Liftbody3d_init) *************************
+    char v_dist_need = 0; // set to 1 when a wall disturbance is active
+    char TW_postive = 0;  // 1 when TW > 0, otherwise 0
+    REAL *pu2d_inlet; //[5][nz][ny]
+    REAL *pu2d_upper; //[5][ny][nx]
+        
+    REAL * pv_dist_wall; // [ny][nx]
+    REAL *pv_dist_coeff; // [3][ny][nx]
+    REAL * pu_dist_upper; // [ny][nx]
+
+    cudaField *pu2d_inlet_d; //[5][nz][ny]
+    cudaField *pu2d_upper_d; //[5][ny][nx]
+    //cudaField *pv_dist_wall_d; // [ny][nx]
+    cudaField *pv_dist_coeff_d; // [3][ny][nx]
+    cudaField *pu_dist_upper_d; // [ny][nx]
+    
+
+// used by the compression-corner boundary condition (bc_user_Compression_conner_init) ********
+    int x_begin; // last local x index whose top-boundary coordinate is <= X_UP_BOUNDARY_BEGIN
+    REAL *pub1; // accessed as [4][ny] in bc_user_Compression_conner_init
+    REAL *pfx; // [nx]
+    REAL *pgz; // [nz]; NOTE(review): the lift-body path indexes it with j < ny - confirm the allocation covers that
+    REAL *TM; // [MTMAX]
+    REAL *fait; // [MTMAX]
+    REAL SLZ; // read from BC_rpara[7]; passed as the length that normalises z in get_xs_blow_suction_multiwave
+
+
+/*
+ * Unpack the user boundary-condition parameters.
+ *
+ * BC_npara holds the integer options and BC_rpara the real-valued ones; which
+ * globals they are copied to depends on IBC_USER (124 or 108). Any other
+ * value of IBC_USER leaves all globals untouched.
+ */
+void bc_parameter(){
+    if(IBC_USER == 124){
+        // integer options
+        IF_SYMMETRY         = BC_npara[0];
+        IF_WITHLEADING      = BC_npara[1]; // 0: without the leading (nose) part, 1: with it
+        IFLAG_UPPERBOUNDARY = BC_npara[2]; // 0: outside the shock, 1: shock
+        MZMAX               = BC_npara[3];
+        MTMAX               = BC_npara[4];
+
+        // real-valued options
+        AOA = BC_rpara[0]; // attack angle, converted from degrees below
+        Sin_AOA = sin(AOA * PI/180);
+        Cos_AOA = cos(AOA * PI/180);
+
+        TW             = BC_rpara[1]; // wall temperature
+        EPSL_WALL      = BC_rpara[2];
+        EPSL_UPPER     = BC_rpara[3];
+        BETA           = BC_rpara[4];
+        WALL_DIS_BEGIN = BC_rpara[5];
+        WALL_DIS_END   = BC_rpara[6];
+    }else if(IBC_USER == 108){
+        // integer options
+        MZMAX                 = BC_npara[0];
+        MTMAX                 = BC_npara[1];
+        INLET_BOUNDARY        = BC_npara[2];
+        IFLAG_WALL_NOT_NORMAL = BC_npara[3];
+
+        // real-valued options
+        TW                  = BC_rpara[0];
+        EPSL                = BC_rpara[1];
+        X_DIST_BEGIN        = BC_rpara[2];
+        X_DIST_END          = BC_rpara[3];
+        BETA                = BC_rpara[4];
+        X_WALL_BEGIN        = BC_rpara[5];
+        X_UP_BOUNDARY_BEGIN = BC_rpara[6];
+        SLZ                 = BC_rpara[7];
+    }
+}
+
+
+/*
+ * Build the host-side boundary data of the lift-body case and upload it.
+ *
+ * Steps:
+ *   1. inlet section: read "flow-inlet-section.dat" on rank 0 and broadcast it
+ *      (IF_WITHLEADING == 1), or copy the x = LAP plane of the current fields;
+ *   2. upper boundary: read "flow-outboundary.dat" when IFLAG_UPPERBOUNDARY == 1;
+ *   3. wall disturbance: a multi-wave blow/suction pattern when MZMAX != 0,
+ *      otherwise random noise restricted to WALL_DIS_BEGIN..WALL_DIS_END;
+ *   4. time weights via get_fait_multifrequancy, then upload of the
+ *      disturbance coefficients.
+ *
+ * Changes against the first version: the temporary coordinate array `xa` is
+ * released after use (it was leaked), and a failed fopen on rank 0 aborts the
+ * MPI job instead of passing a NULL stream to the reader.
+ *
+ * NOTE(review): the NX/NY/NZ_GLOBAL-sized scratch arrays live on the stack;
+ * confirm the stack limit is sufficient for production grid sizes.
+ */
+void bc_user_Liftbody3d_init()
+{
+
+    opencfd_mem_init_boundary();
+
+    // cudaMemcpyToSymbol( HIP_SYMBOL(Sin_AOA_d) , &Sin_AOA , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    // cudaMemcpyToSymbol( HIP_SYMBOL(Cos_AOA_d) , &Cos_AOA , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    // cudaMemcpyToSymbol( HIP_SYMBOL(TW_d) , &TW , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+
+    v_dist_need = 0;
+    if(TW > 0.0) TW_postive = 1; else TW_postive = 0;
+
+    int i, j, k, m;
+
+    // typed views on the flat host buffers
+    REAL(*u2d_inlet)[nz][ny] = (REAL(*)[nz][ny])pu2d_inlet; //[5][nz][ny]
+    REAL(*u2d_upper)[ny][nx] = (REAL(*)[ny][nx])pu2d_upper; //[5][ny][nx]
+
+    REAL(*v_dist_wall)[nx]  = (REAL(*)[nx])pv_dist_wall;
+    REAL(*u_dist_upper)[nx] = (REAL(*)[nx])pu_dist_upper;
+
+    //--------------------------------------------------------------------------
+
+    // scratch buffer for one global inlet plane
+    REAL tmp2d[NZ_GLOBAL][NY_GLOBAL];  //NY_GLOBAL,NZ_GLOBAL
+
+    //--------------Inlet boundary condition ---------------------
+    if (IF_WITHLEADING == 1)
+    {
+        FILE *file = NULL;
+        if (my_id == 0)
+        {
+            file = fopen("flow-inlet-section.dat", "r");
+            if (file == NULL)
+            {
+                // without the file the read below would dereference a NULL stream
+                printf("ERROR: cannot open flow-inlet-section.dat\n");
+                MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            printf("read inlet boundary data: flow-inlet-section.dat\n");
+        }
+        int j1, k1;
+        for (m = 0; m < 5; m++)
+        {
+            // rank 0 reads one global plane per variable, every rank keeps its own part
+            if (my_id == 0) FREAD(tmp2d, sizeof(REAL), NZ_GLOBAL * NY_GLOBAL, file)
+            MPI_Bcast(tmp2d, NY_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+            for (k = 0; k < nz; k++)
+            {
+                for (j = 0; j < ny; j++)
+                {
+                    k1 = k_offset[npz] + k;
+                    j1 = j_offset[npy] + j;
+                    u2d_inlet[m][k][j] = tmp2d[k1][j1];
+                }
+            }
+        }
+
+        if (my_id == 0)
+            fclose(file);
+
+    }else{
+        // no inlet file: take the inlet section from the x = LAP plane of the fields
+        REAL (*d)[ny+2*LAP][nx+2*LAP] = (REAL(*)[ny+2*LAP][nx+2*LAP])pd;
+        REAL (*u)[ny+2*LAP][nx+2*LAP] = (REAL(*)[ny+2*LAP][nx+2*LAP])pu;
+        REAL (*v)[ny+2*LAP][nx+2*LAP] = (REAL(*)[ny+2*LAP][nx+2*LAP])pv;
+        REAL (*w)[ny+2*LAP][nx+2*LAP] = (REAL(*)[ny+2*LAP][nx+2*LAP])pw;
+        REAL (*T)[ny+2*LAP][nx+2*LAP] = (REAL(*)[ny+2*LAP][nx+2*LAP])pT;
+
+        for (k = 0; k < nz; k++)
+        {
+            for (j = 0; j < ny; j++)
+            {
+                u2d_inlet[0][k][j] = d[k+LAP][j+LAP][LAP];
+                u2d_inlet[1][k][j] = u[k+LAP][j+LAP][LAP];
+                u2d_inlet[2][k][j] = v[k+LAP][j+LAP][LAP];
+                u2d_inlet[3][k][j] = w[k+LAP][j+LAP][LAP];
+                u2d_inlet[4][k][j] = T[k+LAP][j+LAP][LAP];
+            }
+        }
+    }
+
+    memcpy_All(pu2d_inlet, pu2d_inlet_d->ptr, pu2d_inlet_d->pitch, H2D, ny, nz, 5);
+
+    //if (Init_stat == 2) liftbody_init();
+    //----------Upper boundary conditon -----------------
+    if (IFLAG_UPPERBOUNDARY == 1)
+    {
+        REAL tmp2d1[NY_GLOBAL][NX_GLOBAL]; //NX_GLOBAL,NY_GLOBAL
+        FILE *file = NULL;
+        if (my_id == 0)
+        {
+            file = fopen("flow-outboundary.dat", "r");
+            if (file == NULL)
+            {
+                // without the file the read below would dereference a NULL stream
+                printf("ERROR: cannot open flow-outboundary.dat\n");
+                MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            printf("read upper boundary data: flow-outboundary.dat\n");
+        }
+        int j1, i1;
+        for (m = 0; m < 5; m++)
+        {
+            if (my_id == 0) FREAD(tmp2d1, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+            MPI_Bcast(tmp2d1, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+            for (j = 0; j < ny; j++)
+            {
+                for (i = 0; i < nx; i++)
+                {
+                    j1 = j_offset[npy] + j;
+                    i1 = i_offset[npx] + i;
+                    u2d_upper[m][j][i] = tmp2d1[j1][i1];
+                }
+            }
+        }
+        memcpy_All(pu2d_upper , pu2d_upper_d->ptr , pu2d_upper_d->pitch , H2D , nx,ny,5);
+
+        if (my_id == 0)
+            fclose(file);
+    }
+
+    //------random wall disturbance ------------------------------
+    REAL(*Axx)
+    [ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pAxx, nx + 2 * LAP, ny + 2 * LAP);
+    REAL rand_x;
+    srand((unsigned)time(NULL));
+
+    REAL tmp_v_dist[NY_GLOBAL][NX_GLOBAL];
+    REAL tmp_u_dist[NY_GLOBAL][NX_GLOBAL];
+
+    REAL(*Ayy)[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pAyy, nx + 2 * LAP, ny + 2 * LAP);
+    REAL(*Azz)[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pAzz, nx + 2 * LAP, ny + 2 * LAP);
+
+    REAL(*v_dist_coeff)[ny][nx] = (REAL(*)[ny][nx])pv_dist_coeff;
+
+    int ilap , jlap;
+
+    if(MZMAX != 0){
+        v_dist_need = 1;
+        if(my_id == 0) printf("Disturbance has been added, MZMAX is %d\n", MZMAX);
+
+        // x coordinates along the first interior line, input of the shape function
+        REAL *xa = (REAL*)malloc(sizeof(REAL)*nx);
+
+        for(int i = 0; i < nx; i++){
+            xa[i] = Axx[LAP][LAP][i + LAP];
+        }
+
+        get_xy_blow_suction_multiwave(nx, MZMAX, xa, pfx, pgz, WALL_DIS_BEGIN, WALL_DIS_END);
+
+        // xa is only an input of the call above; release it (it used to leak)
+        free(xa);
+
+        for (j = 0; j < ny; j++)
+        {
+            for (i = 0; i < nx; i++)
+            {
+                v_dist_wall[j][i] = pfx[i]*pgz[j];
+            }
+        }
+
+        for (j = LAP; j < ny + LAP; j++)
+        {
+            jlap = j-LAP;
+            for (i = LAP; i < nx + LAP; i++)
+            {
+                ilap =i-LAP;
+
+                v_dist_coeff[0][jlap][ilap]= 0;
+                v_dist_coeff[1][jlap][ilap]= EPSL_WALL * v_dist_wall[jlap][ilap] * sin(2*PI/NY_GLOBAL*(jlap+j_offset[npy]));
+                v_dist_coeff[2][jlap][ilap]= EPSL_WALL * v_dist_wall[jlap][ilap] * cos(2*PI/NY_GLOBAL*(jlap+j_offset[npy]));
+            }
+        }
+
+    }else{
+
+        // every rank draws numbers, but the broadcast makes rank 0's values the ones used
+        for (j = 0; j < NY_GLOBAL; j++)
+        {
+            for (i = 0; i < NX_GLOBAL; i++)
+            {
+                rand_x = (rand() / (REAL)RAND_MAX - 0.5) * 2.0;
+                tmp_v_dist[j][i] = EPSL_WALL * rand_x;
+
+                rand_x = (rand() / (REAL)RAND_MAX - 0.5) * 2.0;
+                tmp_u_dist[j][i] = EPSL_UPPER * rand_x;
+            }
+        }
+
+        MPI_Bcast(tmp_v_dist, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+        MPI_Bcast(tmp_u_dist, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+
+        int j1, i1;
+        for (j = 0; j < ny; j++)
+        {
+            for (i = 0; i < nx; i++)
+            {
+                i1 = i_offset[npx] + i;
+                j1 = j_offset[npy] + j;
+
+                // the wall disturbance is limited to the window WALL_DIS_BEGIN..WALL_DIS_END
+                if (Axx[LAP][j + LAP][i + LAP] >= WALL_DIS_BEGIN && Axx[LAP][j + LAP][i + LAP] <= WALL_DIS_END)
+                {
+                    v_dist_need = 1;
+                    v_dist_wall[j][i] = tmp_v_dist[j1][i1];
+                }
+                else
+                {
+                    v_dist_wall[j][i] = 0.0;
+                }
+
+                u_dist_upper[j][i] = tmp_u_dist[j1][i1];
+            }
+        }
+
+        //memcpy_All(pv_dist_wall  , pv_dist_wall_d->ptr  , pv_dist_wall_d->pitch  , H2D , nx,ny,1);
+        memcpy_All(pu_dist_upper, pu_dist_upper_d->ptr, pu_dist_upper_d->pitch, H2D, nx, ny, 1);
+
+        REAL xn, yn, zn, sn;
+
+        for (j = LAP; j < ny + LAP; j++)
+        {
+            jlap = j-LAP;
+            for (i = LAP; i < nx + LAP; i++)
+            {
+                ilap =i-LAP;
+                // unit vector from the first to the second k plane of the mesh
+                xn = Axx[LAP+1][j][i] - Axx[LAP][j][i];
+                yn = Ayy[LAP+1][j][i] - Ayy[LAP][j][i];
+                zn = Azz[LAP+1][j][i] - Azz[LAP][j][i];
+                sn = sqrt(xn * xn + yn * yn + zn * zn);
+
+                v_dist_coeff[0][jlap][ilap]= v_dist_wall[jlap][ilap] * xn / sn;
+                v_dist_coeff[1][jlap][ilap]= v_dist_wall[jlap][ilap] * yn / sn;
+                v_dist_coeff[2][jlap][ilap]= v_dist_wall[jlap][ilap] * zn / sn;
+            }
+        }
+    }
+
+    get_fait_multifrequancy(MTMAX);//Comput TM
+
+    memcpy_All(pv_dist_coeff, pv_dist_coeff_d->ptr, pv_dist_coeff_d->pitch, H2D, nx, ny, 3);
+}
+
+/*
+ * Build the two shape functions of the blow/suction wall disturbance.
+ *
+ *   NX, xx        number of points and their x coordinates
+ *   MZ_MAX        > 0: sum of MZ_MAX modes with random phases,
+ *                 = 0: gz is 1 everywhere,
+ *                 < 0: the single mode |MZ_MAX|
+ *   fx            [out] x shape, zero outside DIST_BEGIN..DIST_END
+ *   gz            [out] second-direction shape, written for j = 0..ny-1
+ *                 (ny, npy and j_offset are globals)
+ *   DIST_BEGIN,
+ *   DIST_END      limits of the disturbed x window
+ *
+ * The random phases are drawn on every rank and then overwritten with the
+ * values of rank 0 through MPI_Bcast.
+ */
+void get_xy_blow_suction_multiwave(int NX, int MZ_MAX, REAL *xx,
+REAL *fx, REAL *gz, REAL DIST_BEGIN, REAL DIST_END){
+    int n_modes = abs(MZ_MAX);
+    REAL *phase  = (REAL*)malloc(sizeof(REAL)*n_modes);
+    REAL *weight = (REAL*)malloc(sizeof(REAL)*n_modes);
+    REAL weight_sum = 0.;
+    REAL seta;
+
+    // weights fall off by a factor 1.25 from one mode to the next
+    for(int k = 0; k < n_modes; k++){
+        phase[k] = rand()/(REAL)RAND_MAX;
+        if(k > 0){
+            weight[k] = weight[k - 1] / 1.25;
+        }else{
+            weight[k] = 1.;
+        }
+        weight_sum = weight_sum + weight[k];
+    }
+
+    // normalise the weights so that they add up to one
+    for(int k = 0; k < n_modes; k++){
+        weight[k] = weight[k] / weight_sum;
+    }
+
+    MPI_Bcast(phase, n_modes, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+
+    // x shape
+    // NOTE(review): the denominator is (DIST_BEGIN - DIST_END), so seta is not
+    // positive inside the window when DIST_BEGIN < DIST_END - confirm the sign is intended.
+    for(int i = 0; i < NX; i++){
+        fx[i] = 0.;
+        if(xx[i] >= DIST_BEGIN && xx[i] <= DIST_END){
+            //seta = 2. * PI * (xx[i] - DIST_BEGIN)/(DIST_BEGIN - DIST_END);
+            seta = 10. * PI * (xx[i] - DIST_BEGIN)/(DIST_BEGIN - DIST_END);
+            fx[i] = 4. / sqrt(27.) * sin(seta) * (1. - cos(seta));
+        }
+    }
+
+    // second-direction shape, based on the global index fraction
+    for(int j = 0; j < ny; j++){
+        seta = ((REAL)(j + j_offset[npy]))/NY_GLOBAL;
+
+        if(MZ_MAX < 0){
+            gz[j] = sin(-2. * PI * MZ_MAX * seta);
+        }else if(MZ_MAX == 0){
+            gz[j] = 1.;
+        }else{
+            gz[j] = 0.;
+            for(int m = 0; m < MZ_MAX; m++){
+                gz[j] = gz[j] + weight[m] * sin(2. * PI * (m + 1) * (seta + phase[m]));
+            }
+        }
+    }
+
+    free(phase);
+    free(weight);
+}
+
+/*
+ * Prepare the boundary data of the compression-corner case and upload it.
+ *
+ * Steps:
+ *   1. rank 0 fills the inlet profile tmp[NY_GLOBAL][4], either from
+ *      "flow1d-inlet.dat" (INLET_BOUNDARY == 1, one header line, then five
+ *      numbers per row of which the first is discarded) or with the uniform
+ *      state (1, 1, 0, 1); the profile is then broadcast;
+ *   2. every rank copies its own j range into pub1, viewed as [4][ny];
+ *   3. x_begin becomes the last local i whose top-boundary x coordinate is
+ *      <= X_UP_BOUNDARY_BEGIN (it is left unchanged if there is none);
+ *   4. the disturbance shapes pfx/pgz and the time weights are computed;
+ *   5. pub1, pfx and pgz are copied to the device.
+ *
+ * Changes against the first version: the coordinate arrays are filled by two
+ * separate loops instead of a nested one (which did nx*nz redundant stores and
+ * filled nothing when either extent was 0), and open/read failures of the
+ * inlet file abort the MPI job instead of continuing with garbage.
+ *
+ * NOTE(review): the "%lf" conversions assume REAL is double - confirm.
+ */
+void bc_user_Compression_conner_init(){
+
+    opencfd_mem_init_boundary();
+
+    REAL(*ub1)[ny]= (REAL(*)[ny])pub1;
+
+    REAL tmp[NY_GLOBAL][4];
+
+    if(my_id == 0){
+        if(INLET_BOUNDARY == 1){
+            char str[100];
+            REAL tmp1;
+            FILE *file = fopen("flow1d-inlet.dat", "r");
+            if(file == NULL){
+                printf("ERROR: cannot open flow1d-inlet.dat\n");
+                MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            printf("read inlet boundary data: flow1d-inlet.dat\n");
+
+            // skip the header line
+            if(fgets(str, 100, file) == NULL){
+                printf("ERROR: flow1d-inlet.dat is empty\n");
+                MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+
+            for(int j = 0; j < NY_GLOBAL; j++){
+                // first column is read and discarded
+                if(fscanf(file, "%lf%lf%lf%lf%lf\n", &tmp1, &tmp[j][0], &tmp[j][1], &tmp[j][2], &tmp[j][3]) != 5){
+                    printf("ERROR: flow1d-inlet.dat: bad or missing data in row %d\n", j);
+                    MPI_Abort(MPI_COMM_WORLD, 1);
+                }
+            }
+            fclose(file);
+        }else{
+            for(int j = 0; j < NY_GLOBAL; j++){
+                tmp[j][0] = 1.;
+                tmp[j][1] = 1.;
+                tmp[j][2] = 0.;
+                tmp[j][3] = 1.; 
+            }
+        }
+    }
+
+    MPI_Bcast(tmp, 4 * NY_GLOBAL, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+
+    REAL *xa = (REAL*)malloc(sizeof(REAL)*nx);
+    REAL *za = (REAL*)malloc(sizeof(REAL)*nz);
+
+    REAL(*Axx)[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pAxx, nx + 2 * LAP, ny + 2 * LAP);
+    REAL(*Azz)[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pAzz, nx + 2 * LAP, ny + 2 * LAP);
+
+    // local part of the broadcast profile, transposed to [variable][j]
+    for(int j = 0; j < ny; j++){
+        int j1 = j_offset[npy] + j;
+        for(int i = 0; i < 4; i++){
+            ub1[i][j] = tmp[j1][i];
+        }
+    }
+
+    // x coordinates of the first interior line; x_begin keeps the last match
+    for(int i = 0; i < nx; i++){
+        if(Axx[LAP][ny + LAP - 1][i + LAP] <= X_UP_BOUNDARY_BEGIN) x_begin = i; 
+        xa[i] = Axx[LAP][LAP][i + LAP];
+    }
+
+    // z coordinates of the first interior line
+    for(int k = 0; k < nz; k++){
+        za[k] = Azz[k + LAP][LAP][LAP];
+    }
+
+    get_xs_blow_suction_multiwave(nx, nz, MZMAX, xa, za, SLZ, pfx, pgz, X_DIST_BEGIN, X_DIST_END);
+
+    get_fait_multifrequancy(MTMAX);//Comput TM
+
+    free(xa);
+    free(za);
+
+    memcpy_All(pub1, pub1_d->ptr, pub1_d->pitch, H2D, ny, 4, 1);
+    memcpy_All(pfx, pfx_d->ptr, pfx_d->pitch, H2D, nx, 1, 1);
+    memcpy_All(pgz, pgz_d->ptr, pgz_d->pitch, H2D, nz, 1, 1);
+}
+
+/*
+ * Build the x and z shape functions of the blow/suction wall disturbance.
+ *
+ *   NX, xx        number of x points and their coordinates
+ *   NZ, zz        number of z points and their coordinates
+ *   MZ_MAX        > 0: sum of MZ_MAX modes with random phases,
+ *                 = 0: gz is 1 everywhere,
+ *                 < 0: the single mode |MZ_MAX|
+ *   SL            length used to normalise zz
+ *   fx            [out] x shape (NX values), zero outside DIST_BEGIN..DIST_END
+ *   gz            [out] z shape (NZ values)
+ *   DIST_BEGIN,
+ *   DIST_END      limits of the disturbed x window
+ *
+ * The z loop now runs over the NZ argument; it used to ignore the argument
+ * and read the global nz (the only caller in this file passes nz, so results
+ * are unchanged).
+ *
+ * The random phases are drawn on every rank and then overwritten with the
+ * values of rank 0 through MPI_Bcast.
+ */
+void get_xs_blow_suction_multiwave(int NX, int NZ, int MZ_MAX, REAL *xx,
+REAL *zz, REAL SL, REAL *fx, REAL *gz, REAL DIST_BEGIN, REAL DIST_END){
+    int MZ_MAX1;
+    REAL ztmp, seta;
+    REAL *faiz, *zl;
+    
+    MZ_MAX1 = abs(MZ_MAX);
+    faiz = (REAL*)malloc(sizeof(REAL)*MZ_MAX1);
+    zl = (REAL*)malloc(sizeof(REAL)*MZ_MAX1);
+    
+    ztmp = 0.;
+    
+    // mode weights fall off by a factor 1.25 from one mode to the next
+    for(int k = 0; k < MZ_MAX1; k++){
+        faiz[k] = rand()/(REAL)RAND_MAX;
+        if(k == 0){
+            zl[k] = 1.;
+        }else{
+            zl[k] = zl[k - 1] / 1.25;
+        }
+        ztmp = ztmp + zl[k];
+    }
+    
+    // normalise the weights so that they add up to one
+    for(int k = 0; k < MZ_MAX1; k++){
+        zl[k] = zl[k] / ztmp;
+    }
+    
+    MPI_Bcast(faiz, MZ_MAX1, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+    
+    // NOTE(review): the denominator is (DIST_BEGIN - DIST_END), so seta is not
+    // positive inside the window when DIST_BEGIN < DIST_END - confirm the sign is intended.
+    for(int i = 0; i < NX; i++){
+        if(xx[i] >= DIST_BEGIN && xx[i] <= DIST_END){
+            //seta = 2. * PI * (xx[i] - DIST_BEGIN)/(DIST_BEGIN - DIST_END);
+            seta = 10. * PI * (xx[i] - DIST_BEGIN)/(DIST_BEGIN - DIST_END);
+            fx[i] = 4. / sqrt(27.) * sin(seta) * (1. - cos(seta));
+        }else{
+            fx[i] = 0.;
+        }
+    }
+
+    for(int k = 0; k < NZ; k++){
+        gz[k] = 0.;
+        seta = zz[k] / SL;
+        if(MZ_MAX > 0){
+            for(int m = 0; m < MZ_MAX; m++){
+                gz[k] = gz[k] + zl[m] * sin(2. * PI * (m + 1) * (seta + faiz[m]));
+            }
+        }else if(MZ_MAX == 0){
+            gz[k] = 1.;
+        }else{
+            gz[k] = sin(-2. * PI * MZ_MAX * seta);
+        }
+    }
+
+    free(faiz);
+    free(zl);
+}
+
+
+/*
+ * Fill the global arrays fait and TM for MT_MAX time modes.
+ *
+ *   fait[k]  random number in [0, 1]; drawn on every rank, then replaced by
+ *            the values of rank 0 through MPI_Bcast
+ *   TM[k]    weights 1, 1/1.25, 1/1.25^2, ... normalised to add up to one
+ *
+ * Nothing is written when MT_MAX <= 0.
+ * The unused local Kflag was removed and TM[0] is now stored once instead of
+ * on every iteration; the results are unchanged.
+ */
+void get_fait_multifrequancy(int MT_MAX){
+    REAL Ttmp = 0.;
+
+    for(int k = 0; k < MT_MAX; k++){
+        fait[k] = rand()/(REAL)RAND_MAX;
+        if(k == 0){
+            TM[k] = 1.;
+        }else{
+            TM[k] = TM[k - 1] / 1.25;
+        }
+        Ttmp = Ttmp + TM[k];
+    }
+
+    for(int k = 0; k < MT_MAX; k++){
+        TM[k] = TM[k] / Ttmp;
+    }
+
+    MPI_Bcast(fait, MT_MAX, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_filtering.cu
+++ b/src/OCFD_filtering.cu
+// Filtering, to remove high-wavenumber oscillations
+// Bogey C, Bailly C,  J. Comput. Phys. 194 (2004) 194-214
+#include "parameters.h"
+#include "parameters_d.h"
+#include "utility.h"
+#include "stdio.h"
+#include "OCFD_mpi.h"
+#include "math.h"
+#include "OCFD_filtering.h"
+#include "OCFD_mpi_dev.h"
+#include "cuda_utility.h"
+
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+/*
+ * Convert the filter regions in Filter_para from the form handed to
+ * get_i_node/get_j_node/get_k_node into local index ranges of this rank.
+ *
+ * For every filter entry and every direction:
+ *   - the begin/end indices are mapped to (owning node, local index);
+ *   - a begin on an earlier node becomes 0, an end on a later node becomes
+ *     the local extent;
+ *   - the direction flag is cleared when the range does not reach this node;
+ *   - on the first/last node of a non-periodic direction the range is kept
+ *     away from the boundary (begin >= 6, end <= extent - 5).
+ * The results are written back into Filter_para[k][1..9].
+ *
+ * Ranges are without halo, start at 0 and are meant as ib <= i < ie.
+ */
+void set_para_filtering(){
+	if(my_id == 0) printf("filter parameters readed\n");
+
+	int ib, ie, jb, je, kb, ke;
+	int node_ib, node_ie, node_jb, node_je, node_kb, node_ke;
+
+	for(int k=0;k<NFiltering;k++){
+		// remember which directions are requested by at least one filter
+		if(Filter_para[k][1] == 1) fiter_judge_X = 1;
+		if(Filter_para[k][2] == 1) fiter_judge_Y = 1;
+		if(Filter_para[k][3] == 1) fiter_judge_Z = 1;
+
+		int flag_i = Filter_para[k][1];
+		int flag_j = Filter_para[k][2];
+		int flag_k = Filter_para[k][3];
+
+		get_i_node(Filter_para[k][4], &node_ib, &ib);
+		get_i_node(Filter_para[k][5], &node_ie, &ie);
+		get_j_node(Filter_para[k][6], &node_jb, &jb);
+		get_j_node(Filter_para[k][7], &node_je, &je);
+		get_k_node(Filter_para[k][8], &node_kb, &kb);
+		get_k_node(Filter_para[k][9], &node_ke, &ke);
+
+		// ---- x direction ----
+		if(node_ib < npx) ib = 0;
+		else if(node_ib > npx) flag_i = 0;
+		if(node_ie > npx) ie = nx;
+		else if(node_ie < npx) flag_i = 0;
+		if(Iperiodic[0] != 1){
+			if(npx == 0) ib=max(ib, 6);
+			if(npx == NPX0-1) ie=min(ie, nx-5);
+		}
+
+		// ---- y direction ----
+		if(node_jb < npy) jb = 0;
+		else if(node_jb > npy) flag_j = 0;
+		if(node_je > npy) je = ny;
+		else if(node_je < npy) flag_j = 0;
+		if(Iperiodic[1] != 1){
+			if(npy == 0) jb=max(jb, 6);
+			if(npy == NPY0-1) je=min(je, ny-5);
+		}
+
+		// ---- z direction ----
+		if(node_kb < npz) kb = 0;
+		else if(node_kb > npz) flag_k = 0;
+		if(node_ke > npz) ke = nz;
+		else if(node_ke < npz) flag_k = 0;
+		if(Iperiodic[2] != 1){
+			if(npz == 0) kb=max(kb, 6);
+			if(npz == NPZ0-1) ke=min(ke, nz-5);
+		}
+
+		// store the local description
+		Filter_para[k][1] = flag_i;
+		Filter_para[k][2] = flag_j;
+		Filter_para[k][3] = flag_k;
+		Filter_para[k][4] = ib;
+		Filter_para[k][5] = ie;
+		Filter_para[k][6] = jb;
+		Filter_para[k][7] = je;
+		Filter_para[k][8] = kb;
+		Filter_para[k][9] = ke;
+	}
+}
+
+/*
+ * Apply the configured filters to the solution.
+ *
+ *   pf, pf0   host buffers forwarded to the Filter_Fo9p routines
+ *   pp        unused; the globals pP / pP_d are used instead
+ *
+ * The routine returns at once unless Istep is a multiple of the period
+ * Filter_para[m][0] of at least one filter. Otherwise the halo of pP_d and of
+ * every variable in pf_lap_d is exchanged in each direction that has a
+ * filter, and each filter that is due and still active
+ * (tt <= Filter_rpara[m][2]) is applied in x, y and z.
+ *
+ * The offset of variable n inside pf_lap_d is now computed in size_t; the
+ * former int product n * pitch * ny_2lap * nz_2lap can overflow on large
+ * fields. The three identical scheme tests were merged; the x, y, z call
+ * order is unchanged.
+ */
+void filtering(
+	REAL *pf,
+	REAL *pf0,
+	REAL *pp)
+{
+	int m, ib, ie, jb, je, kb, ke, IF_filter, Filter_scheme;
+	REAL s0, rth;
+
+	IF_filter = 0;
+
+	for (m = 0; m < NFiltering; m++)
+	{
+		if (Istep % Filter_para[m][0] == 0)
+			IF_filter = 1;
+	}
+
+	if (IF_filter == 0)
+		return; // do not filtering in this step
+
+	MPI_Barrier(MPI_COMM_WORLD);
+	//  --------------Filtering --------------------
+
+	if(fiter_judge_X == 1){
+		exchange_boundary_x_packed_dev(pP , pP_d, Iperiodic[0]);
+
+		// elements per variable, kept in size_t so that n * stride cannot wrap
+		const size_t var_stride = (size_t)pf_lap_d->pitch * ny_2lap * nz_2lap;
+		for(int n=0;n<NVARS;n++){
+			cudaField tmp;
+			tmp.pitch = pf_lap_d->pitch;
+			tmp.ptr = pf_lap_d->ptr + n * var_stride;
+			exchange_boundary_x_packed_dev(pP, &tmp, Iperiodic[0]);
+		}
+	}
+
+
+	if(fiter_judge_Y == 1){
+		exchange_boundary_y_packed_dev(pP, pP_d, Iperiodic[1]);
+
+		const size_t var_stride = (size_t)pf_lap_d->pitch * ny_2lap * nz_2lap;
+		for(int n=0; n < NVARS; n++){
+			cudaField tmp;
+			tmp.pitch = pf_lap_d->pitch;
+			tmp.ptr = pf_lap_d->ptr + n * var_stride;
+			exchange_boundary_y_packed_dev(pP, &tmp, Iperiodic[1]);
+		}
+	}
+
+
+	if(fiter_judge_Z == 1){
+		exchange_boundary_z_packed_dev(pP, pP_d, Iperiodic[2]);
+
+		const size_t var_stride = (size_t)pf_lap_d->pitch * ny_2lap * nz_2lap;
+		for(int n=0; n < NVARS; n++){
+			cudaField tmp;
+			tmp.pitch = pf_lap_d->pitch;
+			tmp.ptr = pf_lap_d->ptr + n * var_stride;
+			exchange_boundary_z_packed_dev(pP, &tmp, Iperiodic[2]);
+		}
+	}
+
+
+	for (m = 0; m < NFiltering; m++)
+	{
+		// skip filters that have expired or are not due in this step
+		if(!(tt <= Filter_rpara[m][2])) continue;
+		if (Istep % Filter_para[m][0] != 0) continue;
+
+		if (my_id == 0)
+			printf("filtering ......\n");
+
+		ib = Filter_para[m][4];
+		ie = Filter_para[m][5];
+		jb = Filter_para[m][6];
+		je = Filter_para[m][7];
+		kb = Filter_para[m][8];
+		ke = Filter_para[m][9];
+		Filter_scheme = Filter_para[m][10];
+
+		s0 = Filter_rpara[m][0];
+		rth = Filter_rpara[m][1];
+
+		if (Filter_scheme == Filter_Fo9p)
+		{
+			filter_x3d(pf, pf0, s0, ib, ie, jb, je, kb, ke);
+			filter_y3d(pf, pf0, s0, ib, ie, jb, je, kb, ke);
+			filter_z3d(pf, pf0, s0, ib, ie, jb, je, kb, ke);
+		}
+		else if (Filter_scheme == Filter_Fopt_shock)
+		{
+			// the last argument is the per-direction flag set by set_para_filtering
+			filter_x3d_shock(pf_d, pf_lap_d, pP_d, s0, rth, ib, ie, jb, je, kb, ke, Filter_para[m][1]);
+			filter_y3d_shock(pf_d, pf_lap_d, pP_d, s0, rth, ib, ie, jb, je, kb, ke, Filter_para[m][2]);
+			filter_z3d_shock(pf_d, pf_lap_d, pP_d, s0, rth, ib, ie, jb, je, kb, ke, Filter_para[m][3]);
+		}
+	}
+}
+
+#define CUDA_FUN_UNFINISH /* placeholder body: rank 0 prints the calling function, file and line, then aborts the MPI job; used without a trailing semicolon */ \
+if(my_id == 0){\
+	printf("ERROR : %s ( File %s , Line %d ) , is undering developing , current unavailable!!!\n" , __FUNCTION__ , __FILE__,__LINE__);\
+	MPI_Abort(MPI_COMM_WORLD,1);\
+}
+//---------------------------------------------------
+void filter_x3d(
+	REAL *pf,
+	REAL *pf0,
+	REAL s0,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke)
+{
+	CUDA_FUN_UNFINISH
+/*
+	Not available in this version: the only live statement is the macro
+	above, which makes rank 0 report the call and abort the MPI job.
+
+	The disabled host-side implementation is kept below for reference
+	(x direction; s0 scales the correction). Typos in the copy loop
+	(stray token after the opening brace, "i < ny" as the j-loop
+	condition) are corrected here.
+
+	int i, j, k, m, ib1 = ib, ie1 = ie;
+
+	REAL(*f)
+	[nz][ny][nx] = PTR2ARRAY3(pf, nx, ny, nz);
+	REAL(*f0)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pf0, nx + 2 * LAP, ny + 2 * LAP);
+
+	const REAL d0 = 0.243527493120, d1 = -0.204788880640, d2 = 0.120007591680;
+	const REAL d3 = -0.045211119360, d4 = 0.008228661760;
+
+	if (npx == 0 && Iperiodic[0] != 1)
+		ib1 = MAX(ib, 6);
+	if (npx == NPX0 - 1 && Iperiodic[0] != 1)
+		ie1 = MIN(ie, nx - 5);
+
+	for (m = 0; m < NVARS; m++)
+	{
+		for (k = 0; k < nz; k++)
+		{
+			for (j = 0; j < ny; j++)
+			{
+				for (i = 0; i < nx; i++)
+				{
+					f0[k + LAP][j + LAP][i + LAP] = f[m][k][j][i];
+				}
+			}
+		}
+
+		exchange_boundary_x(pf0, Iperiodic[0]);
+
+		for (k = kb + LAP; k <= ke + LAP; k++)
+		{
+			for (j = jb + LAP; j <= je + LAP; j++)
+			{
+				for (i = ib1 + LAP; i <= ie1 + LAP; i++)
+				{
+					f[m][k - LAP][j - LAP][i - LAP] = f0[k][j][i] - s0 * (d0 * f0[k][j][i] + d1 * (f0[k][j][i - 1] + f0[k][j][i + 1]) + d2 * (f0[k][j][i - 2] + f0[k][j][i + 2]) + d3 * (f0[k][j][i - 3] + f0[k][j][i + 3]) + d4 * (f0[k][j][i - 4] + f0[k][j][i + 4]));
+				}
+			}
+		}
+	}
+*/
+}
+
+//---------------------------------------------------
+void filter_y3d(
+	REAL *pf,
+	REAL *pf0,
+	REAL s0,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke)
+{
+	CUDA_FUN_UNFINISH
+/*
+	Not available in this version: the only live statement is the macro
+	above, which makes rank 0 report the call and abort the MPI job.
+
+	The disabled host-side implementation is kept below for reference
+	(y direction; s0 scales the correction). The typo "i < ny" in the
+	j-loop condition of the copy loop is corrected here.
+
+	int i, j, k, m, jb1 = jb, je1 = je;
+
+	REAL(*f)
+	[nz][ny][nx] = PTR2ARRAY3(pf, nx, ny, nz);
+	REAL(*f0)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pf0, nx + 2 * LAP, ny + 2 * LAP);
+
+	const REAL d0 = 0.243527493120, d1 = -0.204788880640, d2 = 0.120007591680;
+	const REAL d3 = -0.045211119360, d4 = 0.008228661760;
+
+	if (npy == 0 && Iperiodic[1] != 1)
+		jb1 = MAX(jb, 6);
+	if (npy == NPY0 - 1 && Iperiodic[1] != 1)
+		je1 = MIN(je, ny - 5);
+
+	for (m = 0; m < NVARS; m++)
+	{
+		for (k = 0; k < nz; k++)
+		{
+			for (j = 0; j < ny; j++)
+			{
+				for (i = 0; i < nx; i++)
+				{
+					f0[k + LAP][j + LAP][i + LAP] = f[m][k][j][i];
+				}
+			}
+		}
+
+		exchange_boundary_y(pf0, Iperiodic[1]);
+
+		for (k = kb + LAP; k <= ke + LAP; k++)
+		{
+			for (j = jb1 + LAP; j <= je1 + LAP; j++)
+			{
+				for (i = ib + LAP; i <= ie + LAP; i++)
+				{
+					f[m][k - LAP][j - LAP][i - LAP] = f0[k][j][i] - s0 * (d0 * f0[k][j][i] + d1 * (f0[k][j - 1][i] + f0[k][j + 1][i]) + d2 * (f0[k][j - 2][i] + f0[k][j + 2][i]) + d3 * (f0[k][j - 3][i] + f0[k][j + 3][i]) + d4 * (f0[k][j - 4][i] + f0[k][j + 4][i]));
+				}
+			}
+		}
+	}
+*/
+}
+
+//---------------------------------------------------
+void filter_z3d(
+	REAL *pf,
+	REAL *pf0,
+	REAL s0,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke)
+{
+	CUDA_FUN_UNFINISH
+/*
+	Not available in this version: the only live statement is the macro
+	above, which makes rank 0 report the call and abort the MPI job.
+
+	The disabled host-side implementation is kept below for reference
+	(z direction; s0 scales the correction). Typos are corrected here:
+	"i < ny" in the j-loop condition of the copy loop, and the middle
+	index of the k - n neighbours, which read [k] instead of [j].
+
+	int i, j, k, m, kb1 = kb, ke1 = ke;
+
+	REAL(*f)
+	[nz][ny][nx] = PTR2ARRAY3(pf, nx, ny, nz);
+	REAL(*f0)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pf0, nx + 2 * LAP, ny + 2 * LAP);
+
+	const REAL d0 = 0.243527493120, d1 = -0.204788880640, d2 = 0.120007591680;
+	const REAL d3 = -0.045211119360, d4 = 0.008228661760;
+
+	if (npz == 0 && Iperiodic[2] != 1)
+		kb1 = MAX(kb, 6);
+	if (npz == NPZ0 - 1 && Iperiodic[2] != 1)
+		ke1 = MIN(ke, nz - 5);
+
+	for (m = 0; m < NVARS; m++)
+	{
+		for (k = 0; k < nz; k++)
+		{
+			for (j = 0; j < ny; j++)
+			{
+				for (i = 0; i < nx; i++)
+				{
+					f0[k + LAP][j + LAP][i + LAP] = f[m][k][j][i];
+				}
+			}
+		}
+
+		exchange_boundary_z(pf0, Iperiodic[2]);
+
+		for (k = kb1 + LAP; k <= ke1 + LAP; k++)
+		{
+			for (j = jb + LAP; j <= je + LAP; j++)
+			{
+				for (i = ib + LAP; i <= ie + LAP; i++)
+				{
+					f[m][k - LAP][j - LAP][i - LAP] = f0[k][j][i] - s0 * (d0 * f0[k][j][i] + d1 * (f0[k - 1][j][i] + f0[k + 1][j][i]) + d2 * (f0[k - 2][j][i] + f0[k + 2][j][i]) + d3 * (f0[k - 3][j][i] + f0[k + 3][j][i]) + d4 * (f0[k - 4][j][i] + f0[k + 4][j][i]));
+				}
+			}
+		}
+	}
+*/
+}
+//------------------------------------------------------------
+// Shock-capturing filtering
+
+// Stencil weights shared by the filter_{x,y,z}3d_shock kernels below:
+// c1 multiplies the difference of two adjacent points, c2 the difference of
+// the two points three cells apart that straddle the same interface.
+static __device__ __constant__ REAL filter_shock_c1_d = -0.2103830;
+static __device__ __constant__ REAL filter_shock_c2_d = 0.0396170;
+
+/*
+ * Shock-capturing filter along x: for every variable m < NVARS the filtered
+ * value of f_lap is written to f.
+ *
+ * Launch layout (set up by filter_x3d_shock): blockDim.x is two larger than
+ * the number of points one block filters, and the dynamic shared array hh
+ * needs one REAL per thread of the block.
+ *
+ *   P      field the sensor is built from (the host wrapper passes pP_d;
+ *          presumably pressure -- TODO confirm), LAP-padded indexing
+ *   f_lap  input variables, LAP-padded indexing (get_SoA_LAP)
+ *   f      output variables, unpadded indexing (get_SoA, hence the -LAP)
+ *   rth    sensor threshold, used as hh = max(0, 1 - rth/ri)
+ *   s0     filter strength multiplying the averaged sensor
+ *   job    index box; points job.start.x .. job.end.x-1 are filtered
+ */
+__global__ void filter_x3d_shock_kernel(cudaField P , cudaSoA f_lap , cudaSoA f , REAL rth , REAL s0 , cudaJobPackage job){
+	// x is shifted by -1 so that threadIdx.x == 0 sits one point below the
+	// first point filtered by this block.
+    unsigned int x = (blockDim.x-2) * blockIdx.x + threadIdx.x - 1 + job.start.x;
+    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+	extern __shared__ REAL hh[];
+
+	// Phase 1: sensor value at x for x in [job.start.x-1, job.end.x]; the two
+	// extra positions are only needed for the neighbour averages of phase 2.
+    if( x<job.end.x + 1 && y<job.end.y && z<job.end.z){
+		{
+			REAL ri;
+			{
+				REAL dp0 , dp1 , dp2 , p_00;
+				REAL p_m2 , p_m1 , p_p1 , p_p2;
+				p_m2 = get_Field_LAP(P , x-2,y,z);
+				p_m1 = get_Field_LAP(P , x-1,y,z);
+				p_00 = get_Field_LAP(P , x  ,y,z);
+				p_p1 = get_Field_LAP(P , x+1,y,z);
+				p_p2 = get_Field_LAP(P , x+2,y,z);
+
+				// Scaled second differences centred at x, x+1 and x-1.
+				dp0 = 0.25 * (-p_p1 + 2.0 * p_00 - p_m1);
+				dp1 = 0.25 * (-p_p2 + 2.0 * p_p1 - p_00);
+				dp2 = 0.25 * (-p_00 + 2.0 * p_m1 - p_m2);
+				// 1e-16 keeps the later division rth / ri finite.
+				ri = 0.5 * ((dp0 - dp1) * (dp0 - dp1) + (dp0 - dp2) * (dp0 - dp2)) / (p_00 * p_00) + 1e-16;
+			}
+			unsigned lid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
+			ri = 1.0 - rth / ri;
+			hh[lid] = 0.5 * (ri + fabs(ri));	// max(ri, 0)
+		}
+	}
+	// Barrier is outside every branch so all threads of the block reach it.
+	__syncthreads();
+
+	// Phase 2: thread t (t >= 2) filters the point owned by thread t-1, i.e.
+	// x-1. The bound must therefore hold for x-1 (x-1 < job.end.x, written
+	// as x < job.end.x + 1); testing x < job.end.x would leave the last
+	// point job.end.x-1 unfiltered although its sensor values exist.
+    if( threadIdx.x > 1 && x<job.end.x + 1 && y<job.end.y && z<job.end.z){
+		REAL Sc1 , Sc2;
+		{
+			unsigned lid =threadIdx.x -1 + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
+			Sc1 = 0.5*(hh[lid] + hh[lid+1]) * s0;	// strength towards x+1
+			Sc2 = 0.5*(hh[lid] + hh[lid-1]) * s0;	// strength towards x-1
+		}
+		x -= 1;
+		for(char m = 0;m<NVARS;m++){
+			get_SoA(f , x-LAP , y-LAP , z-LAP , m) = 
+				get_SoA_LAP(f_lap , x,y,z, m) - (
+					Sc1 * ( filter_shock_c1_d * (get_SoA_LAP(f_lap , x+1,y,z, m) - get_SoA_LAP(f_lap, x,y,z, m)    ) + filter_shock_c2_d * (get_SoA_LAP(f_lap ,x+2 , y,z , m) - get_SoA_LAP(f_lap , x-1 , y,z, m)) )
+				  - Sc2 * ( filter_shock_c1_d * (get_SoA_LAP(f_lap , x,y,z, m)   - get_SoA_LAP(f_lap, x-1,y,z , m) ) + filter_shock_c2_d * (get_SoA_LAP(f_lap ,x+1 , y,z , m) - get_SoA_LAP(f_lap , x-2 , y,z, m)) )
+				);
+		}
+	}
+}
+
+/*
+ * Host wrapper for filter_x3d_shock_kernel.
+ *
+ * Does nothing unless IF_Filter == 1. Otherwise the index box is shifted by
+ * LAP, the launch geometry is taken from cal_grid_block_dim for the extents
+ * (ie-ib, je-jb, ke-kb), blockDim.x is widened by two extra threads and one
+ * REAL of dynamic shared memory is reserved per thread.
+ *
+ * NOTE(review): pf, pf0 and pp are not used; the kernel is always launched
+ * on the globals pf_d, pf_lap_d and pP_d -- confirm callers rely on that.
+ */
+void filter_x3d_shock(
+	cudaSoA *pf,
+	cudaSoA *pf0,
+	cudaField *pp,
+	REAL s0,
+	REAL rth,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke,
+	int IF_Filter)
+{
+	if(IF_Filter != 1) return;
+
+	// Index box in LAP-padded coordinates.
+	const int x_lo = ib + LAP, x_hi = ie + LAP;
+	const int y_lo = jb + LAP, y_hi = je + LAP;
+	const int z_lo = kb + LAP, z_hi = ke + LAP;
+
+	dim3 griddim , blockdim;
+	cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , x_hi - x_lo , y_hi - y_lo , z_hi - z_lo);
+	blockdim.x += 2;	// two extra threads along x for the sensor neighbours
+
+	const size_t shared_bytes = sizeof(REAL)*(blockdim.x) * blockdim.y * blockdim.z;
+	cudaJobPackage job(dim3(x_lo , y_lo , z_lo),dim3(x_hi , y_hi , z_hi));
+
+	CUDA_LAUNCH(( filter_x3d_shock_kernel<<<griddim , blockdim , shared_bytes>>>(*pP_d, *pf_lap_d, *pf_d, rth, s0, job) ));
+}
+
+//---------------------------------------------------
+/*
+ * Shock-capturing filter along y: for every variable m < NVARS the filtered
+ * value of f_lap is written to f.
+ *
+ * Launch layout (set up by filter_y3d_shock): blockDim.y is two larger than
+ * the number of points one block filters, and the dynamic shared array hh
+ * needs one REAL per thread of the block (y is the fastest index in hh).
+ *
+ *   P      field the sensor is built from (the host wrapper passes pP_d;
+ *          presumably pressure -- TODO confirm), LAP-padded indexing
+ *   f_lap  input variables, LAP-padded indexing (get_SoA_LAP)
+ *   f      output variables, unpadded indexing (get_SoA, hence the -LAP)
+ *   rth    sensor threshold, used as hh = max(0, 1 - rth/ri)
+ *   s0     filter strength multiplying the averaged sensor
+ *   job    index box; points job.start.y .. job.end.y-1 are filtered
+ */
+__global__ void filter_y3d_shock_kernel(cudaField P, cudaSoA f_lap, cudaSoA f, REAL rth, REAL s0, cudaJobPackage job){
+    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	// y is shifted by -1 so that threadIdx.y == 0 sits one point below the
+	// first point filtered by this block.
+    unsigned int y = (blockDim.y-2) * blockIdx.y + threadIdx.y - 1 + job.start.y;
+    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+	extern __shared__ REAL hh[];
+
+	// Phase 1: sensor value at y for y in [job.start.y-1, job.end.y].
+	if( x<job.end.x && y<job.end.y + 1 && z<job.end.z){
+		{
+			REAL ri;
+			{
+				REAL dp0 , dp1 , dp2 , p_00;
+				REAL p_m2 , p_m1 , p_p1 , p_p2;
+				p_m2 = get_Field_LAP(P , x,y-2,z);
+				p_m1 = get_Field_LAP(P , x,y-1,z);
+				p_00 = get_Field_LAP(P , x  ,y,z);
+				p_p1 = get_Field_LAP(P , x,y+1,z);
+				p_p2 = get_Field_LAP(P , x,y+2,z);
+
+				// Scaled second differences centred at y, y+1 and y-1.
+				dp0 = 0.25 * (-p_p1 + 2.0 * p_00 - p_m1);
+				dp1 = 0.25 * (-p_p2 + 2.0 * p_p1 - p_00);
+				dp2 = 0.25 * (-p_00 + 2.0 * p_m1 - p_m2);
+				// 1e-16 keeps the later division rth / ri finite.
+				ri = 0.5 * ((dp0 - dp1) * (dp0 - dp1) + (dp0 - dp2) * (dp0 - dp2)) / (p_00 * p_00) + 1e-16;
+			}
+			unsigned lid = threadIdx.y + blockDim.y * (threadIdx.x + blockDim.x * threadIdx.z);
+			ri = 1.0 - rth / ri;
+			hh[lid] = 0.5 * (ri + fabs(ri));	// max(ri, 0)
+		}
+	}
+	// Barrier is outside every branch so all threads of the block reach it.
+	__syncthreads();
+
+	// Phase 2: thread t (t >= 2) filters the point owned by thread t-1, i.e.
+	// y-1. The bound must therefore hold for y-1 (y-1 < job.end.y, written
+	// as y < job.end.y + 1); testing y < job.end.y would leave the last
+	// point job.end.y-1 unfiltered although its sensor values exist.
+    if( x<job.end.x && threadIdx.y > 1 && y<job.end.y + 1 && z<job.end.z){
+		REAL Sc1 , Sc2;
+		{
+			unsigned lid = threadIdx.y -1 + blockDim.y * (threadIdx.x + blockDim.x * threadIdx.z);
+			Sc1 = 0.5*(hh[lid] + hh[lid+1]) * s0;	// strength towards y+1
+			Sc2 = 0.5*(hh[lid] + hh[lid-1]) * s0;	// strength towards y-1
+		}
+		y -= 1;
+		for(char m = 0;m<NVARS;m++){
+			get_SoA(f , x-LAP , y-LAP , z-LAP , m) = 
+				get_SoA_LAP(f_lap , x,y,z, m) - (
+					Sc1 * ( filter_shock_c1_d * (get_SoA_LAP(f_lap , x,y+1,z, m) - get_SoA_LAP(f_lap, x,y,z, m)    ) + filter_shock_c2_d * (get_SoA_LAP(f_lap ,x , y+2,z , m) - get_SoA_LAP(f_lap , x , y-1,z, m)) )
+				  - Sc2 * ( filter_shock_c1_d * (get_SoA_LAP(f_lap , x,y,z, m)   - get_SoA_LAP(f_lap, x,y-1,z , m) ) + filter_shock_c2_d * (get_SoA_LAP(f_lap ,x , y+1,z , m) - get_SoA_LAP(f_lap , x , y-2,z, m)) )
+				);
+		}
+	}
+}
+
+
+/*
+ * Host wrapper for filter_y3d_shock_kernel.
+ *
+ * Does nothing unless IF_Filter == 1. Otherwise the index box is shifted by
+ * LAP, the launch geometry is taken from cal_grid_block_dim for the extents
+ * (ie-ib, je-jb, ke-kb), blockDim.y is widened by two extra threads and one
+ * REAL of dynamic shared memory is reserved per thread.
+ *
+ * NOTE(review): pf, pf0 and pp are not used; the kernel is always launched
+ * on the globals pf_d, pf_lap_d and pP_d -- confirm callers rely on that.
+ */
+void filter_y3d_shock(
+	cudaSoA *pf,
+	cudaSoA *pf0,
+	cudaField *pp,
+	REAL s0,
+	REAL rth,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke,
+	int IF_Filter)
+{
+	if(IF_Filter != 1) return;
+
+	// Index box in LAP-padded coordinates.
+	const int x_lo = ib + LAP, x_hi = ie + LAP;
+	const int y_lo = jb + LAP, y_hi = je + LAP;
+	const int z_lo = kb + LAP, z_hi = ke + LAP;
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, x_hi - x_lo, y_hi - y_lo, z_hi - z_lo);
+	blockdim.y += 2;	// two extra threads along y for the sensor neighbours
+
+	const size_t shared_bytes = sizeof(REAL)*(blockdim.x)*blockdim.y*blockdim.z;
+	cudaJobPackage job(dim3(x_lo, y_lo, z_lo),dim3(x_hi, y_hi, z_hi));
+
+	CUDA_LAUNCH((filter_y3d_shock_kernel<<<griddim, blockdim, shared_bytes>>>(*pP_d, *pf_lap_d, *pf_d, rth, s0, job)));
+}
+
+//---------------------------------------------------
+/*
+ * Shock-capturing filter along z: for every variable m < NVARS the filtered
+ * value of f_lap is written to f.
+ *
+ * Launch layout: blockDim.z must be two larger than the number of points one
+ * block filters, and the dynamic shared array hh needs one REAL per thread of
+ * the block (z is the fastest index in hh).
+ *
+ *   P      field the sensor is built from (presumably pressure -- TODO
+ *          confirm), LAP-padded indexing
+ *   f_lap  input variables, LAP-padded indexing (get_SoA_LAP)
+ *   f      output variables, unpadded indexing (get_SoA, hence the -LAP)
+ *   rth    sensor threshold, used as hh = max(0, 1 - rth/ri)
+ *   s0     filter strength multiplying the averaged sensor
+ *   job    index box; points job.start.z .. job.end.z-1 are filtered
+ */
+__global__ void filter_z3d_shock_kernel(cudaField P, cudaSoA f_lap, cudaSoA f, REAL rth, REAL s0, cudaJobPackage job){
+    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	// z is shifted by -1 so that threadIdx.z == 0 sits one point below the
+	// first point filtered by this block.
+    unsigned int z = (blockDim.z-2) * blockIdx.z + threadIdx.z - 1 + job.start.z;
+	extern __shared__ REAL hh[];
+
+	// Phase 1: sensor value at z for z in [job.start.z-1, job.end.z].
+	if( x<job.end.x && y<job.end.y && z<job.end.z + 1){
+		{
+			REAL ri;
+			{
+				REAL dp0 , dp1 , dp2 , p_00;
+				REAL p_m2 , p_m1 , p_p1 , p_p2;
+				p_m2 = get_Field_LAP(P , x,y,z-2);
+				p_m1 = get_Field_LAP(P , x,y,z-1);
+				p_00 = get_Field_LAP(P , x  ,y,z);
+				p_p1 = get_Field_LAP(P , x,y,z+1);
+				p_p2 = get_Field_LAP(P , x,y,z+2);
+
+				// Scaled second differences centred at z, z+1 and z-1.
+				dp0 = 0.25 * (-p_p1 + 2.0 * p_00 - p_m1);
+				dp1 = 0.25 * (-p_p2 + 2.0 * p_p1 - p_00);
+				dp2 = 0.25 * (-p_00 + 2.0 * p_m1 - p_m2);
+				// 1e-16 keeps the later division rth / ri finite.
+				ri = 0.5 * ((dp0 - dp1) * (dp0 - dp1) + (dp0 - dp2) * (dp0 - dp2)) / (p_00 * p_00) + 1e-16;
+			}
+			unsigned lid = threadIdx.z + blockDim.z * (threadIdx.x + blockDim.x * threadIdx.y);
+			ri = 1.0 - rth / ri;
+			hh[lid] = 0.5 * (ri + fabs(ri));	// max(ri, 0)
+		}
+	}
+	// Barrier is outside every branch so all threads of the block reach it.
+	__syncthreads();
+
+	// Phase 2: thread t (t >= 2) filters the point owned by thread t-1, i.e.
+	// z-1. The bound must therefore hold for z-1 (z-1 < job.end.z, written
+	// as z < job.end.z + 1); testing z < job.end.z would leave the last
+	// point job.end.z-1 unfiltered although its sensor values exist.
+    if( x<job.end.x && y<job.end.y && threadIdx.z > 1 && z<job.end.z + 1){
+		REAL Sc1 , Sc2;
+		{
+			unsigned lid = threadIdx.z -1 + blockDim.z * (threadIdx.x + blockDim.x * threadIdx.y);
+			Sc1 = 0.5*(hh[lid] + hh[lid+1]) * s0;	// strength towards z+1
+			Sc2 = 0.5*(hh[lid] + hh[lid-1]) * s0;	// strength towards z-1
+		}
+		z -= 1;
+		for(char m = 0;m<NVARS;m++){
+			get_SoA(f , x-LAP , y-LAP , z-LAP , m) = 
+				get_SoA_LAP(f_lap , x,y,z, m) - (
+					Sc1 * ( filter_shock_c1_d * (get_SoA_LAP(f_lap , x,y,z+1, m) - get_SoA_LAP(f_lap, x,y,z, m)    ) + filter_shock_c2_d * (get_SoA_LAP(f_lap ,x , y,z+2 , m) - get_SoA_LAP(f_lap , x , y,z-1, m)) )
+				  - Sc2 * ( filter_shock_c1_d * (get_SoA_LAP(f_lap , x,y,z, m)   - get_SoA_LAP(f_lap, x,y,z-1 , m) ) + filter_shock_c2_d * (get_SoA_LAP(f_lap ,x , y,z+1 , m) - get_SoA_LAP(f_lap , x , y,z-2, m)) )
+				);
+		}
+	}
+}
+
+
+/*
+ * Host wrapper for filter_z3d_shock_kernel.
+ *
+ * Does nothing unless IF_Filter == 1. Otherwise the index box is shifted by
+ * LAP, the launch geometry is taken from cal_grid_block_dim for the extents
+ * (ie-ib, je-jb, ke-kb), blockDim.z is widened by two extra threads and one
+ * REAL of dynamic shared memory is reserved per thread.
+ *
+ * NOTE(review): pf, pf0 and pp are not used; the kernel is always launched
+ * on the globals pf_d, pf_lap_d and pP_d -- confirm callers rely on that.
+ */
+void filter_z3d_shock(
+	cudaSoA *pf,
+	cudaSoA *pf0,
+	cudaField *pp,
+	REAL s0,
+	REAL rth,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke,
+	int IF_Filter)
+{
+	if(IF_Filter == 1){
+		ib += LAP;
+		ie += LAP;
+		jb += LAP;
+		je += LAP;
+		kb += LAP;
+		ke += LAP;
+
+		dim3 griddim, blockdim;
+		cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, ie - ib, je - jb, ke - kb);
+		blockdim.z += 2;	// two extra threads along z for the sensor neighbours
+		cudaJobPackage job(dim3(ib, jb, kb),dim3(ie, je, ke));
+
+		// The z-padded block must run the z kernel: the y kernel expects its
+		// two extra threads along y and would filter in the wrong direction
+		// with a mismatched index mapping.
+		CUDA_LAUNCH((filter_z3d_shock_kernel<<<griddim, blockdim, sizeof(REAL)*(blockdim.x)*blockdim.y*blockdim.z>>>(*pP_d, *pf_lap_d, *pf_d, rth, s0, job)));
+	}
+}
+//------------------------------------------------------------
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_flux_charteric.cu
+++ b/src/OCFD_flux_charteric.cu
+#include <math.h>
+
+#include "commen_kernel.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "parameters.h"
+#include "parameters_d.h"
+
+#include "OCFD_flux_charteric.h"
+#include "OCFD_Schemes_hybrid_auto.h"
+#include "OCFD_Schemes.h"
+#include "OCFD_bound_Scheme.h"
+#include "OCFD_warp_shuffle.h"
+
+
+/*
+ * Atomically accumulate one flux-difference contribution into component
+ * `num` of du at the cell given by coords + job.start (stored at the
+ * unpadded index, hence the -LAP).
+ *
+ *   flagxyz.x  sweep direction: 1/4 -> x (hx_d), 2/5 -> y (hy_d), 3/6 -> z (hz_d);
+ *              any other value leaves du untouched
+ *   flagxyz.y  boundary selector compared against 1..3 (coordinate == 1) and
+ *              4..6 (coordinate == job.end - job.start - 1)
+ *   flagxyz.z  when 1 on the low boundary cell, a zero is added instead
+ *   tmp        difference used on a flagged boundary cell
+ *   tmp_r/l    right/left values whose difference is used everywhere else
+ *   Ajac       field read at (x, y, z) as the multiplier of the difference
+ */
+__device__ void put_du_character_p_kernel(dim3 flagxyz, dim3 coords, REAL tmp, REAL tmp_r, REAL tmp_l, cudaSoA du, int num, cudaField Ajac, cudaJobPackage job){
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	// Per-direction settings: spacing, the two boundary flag values and
+	// whether this thread sits on the first/last flagged cell.
+	REAL h;
+	unsigned int lo_flag, hi_flag;
+	bool at_lo, at_hi;
+
+	switch(flagxyz.x){
+		case 1:
+		case 4:
+			h = hx_d; lo_flag = 1; hi_flag = 4;
+			at_lo = (coords.x == 1);
+			at_hi = (coords.x == job.end.x-job.start.x-1);
+			break;
+
+		case 2:
+		case 5:
+			h = hy_d; lo_flag = 2; hi_flag = 5;
+			at_lo = (coords.y == 1);
+			at_hi = (coords.y == job.end.y-job.start.y-1);
+			break;
+
+		case 3:
+		case 6:
+			h = hz_d; lo_flag = 3; hi_flag = 6;
+			at_lo = (coords.z == 1);
+			at_hi = (coords.z == job.end.z-job.start.z-1);
+			break;
+
+		default:
+			return;
+	}
+
+	REAL delta;
+	if(flagxyz.y == lo_flag && flagxyz.z == 1 && at_lo){
+		// Low boundary cell with flagxyz.z set: contribute nothing.
+		delta = 0;
+	}else if((flagxyz.y == lo_flag && at_lo) || (flagxyz.y == hi_flag && at_hi)){
+		// Flagged boundary cell: use the precomputed one-sided difference.
+		delta = -get_Field_LAP(Ajac, x, y, z)*tmp/h;
+	}else{
+		delta = -get_Field_LAP(Ajac, x, y, z)*(tmp_r - tmp_l)/h;
+	}
+
+	atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), delta);
+}
+
+
+/*
+ * Atomically accumulate one flux-difference contribution into component
+ * `num` of du. Unlike put_du_character_p_kernel the target is the cell one
+ * step below (x-1, y-1 or z-1, depending on the sweep direction), and Ajac
+ * is read at that same shifted cell.
+ *
+ *   flagxyz.x  sweep direction: 1/4 -> x (hx_d), 2/5 -> y (hy_d), 3/6 -> z (hz_d);
+ *              any other value leaves du untouched
+ *   flagxyz.y  boundary selector compared against 1..3 (coordinate == 1) and
+ *              4..6 (coordinate == job.end - job.start - 1)
+ *   flagxyz.z  when 1 on the high boundary cell, a zero is added instead
+ *   tmp        difference used on a flagged boundary cell
+ *   tmp_r/l    right/left values whose difference is used everywhere else
+ */
+__device__ void put_du_character_m_kernel(dim3 flagxyz, dim3 coords, REAL tmp, REAL tmp_r, REAL tmp_l, cudaSoA du, int num, cudaField Ajac, cudaJobPackage job){
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	switch(flagxyz.x){
+		case 1:
+		case 4:
+		if(flagxyz.y == 4 && flagxyz.z == 1 && coords.x == job.end.x-job.start.x-1){
+            atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
+		}else{
+               if((flagxyz.y == 1 && coords.x == 1) || (flagxyz.y == 4 && coords.x == job.end.x-job.start.x-1)){
+                    atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x-1, y, z)*tmp/hx_d);
+               }else{
+                atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x-1, y, z)*(tmp_r - tmp_l)/hx_d);
+		     }
+          }
+		//get_Field(Ajac, x-LAP-1, y-LAP, z-LAP) = (tmp_r - tmp_l)/hx_d;
+		break;
+
+		case 2:
+		case 5:
+		if(flagxyz.y == 5 && flagxyz.z == 1 && coords.y == job.end.y-job.start.y-1){
+            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), 0);
+		}else{
+               // The low y boundary is flag 2 (x uses 1, z uses 3); testing
+               // flag 3 here would never select the one-sided difference on
+               // the low y boundary and would wrongly select it for z-flagged
+               // sweeps.
+               if((flagxyz.y == 2 && coords.y == 1) || (flagxyz.y == 5 && coords.y == job.end.y-job.start.y-1)){
+                atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y-1, z)*tmp/hy_d);
+               }else{
+                atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y-1, z)*(tmp_r - tmp_l)/hy_d);
+		     }
+          }
+		//get_Field(Ajac, x-LAP, y-LAP-1, z-LAP) = (tmp_r - tmp_l)/hy_d;
+		break;
+
+		case 3:
+		case 6:
+		if(flagxyz.y == 6 && flagxyz.z == 1 && coords.z == job.end.z-job.start.z-1){
+            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), 0);
+		}else{
+               if((flagxyz.y == 3 && coords.z == 1) || (flagxyz.y == 6 && coords.z == job.end.z-job.start.z-1)){
+                atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z-1)*tmp/hz_d);
+               }else{
+                atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z-1)*(tmp_r - tmp_l)/hz_d);
+		     }
+          }
+		//get_Field(Ajac, x-LAP, y-LAP, z-LAP-1) = (tmp_r - tmp_l)/hz_d;
+		break;
+	}
+}
+
+
+/*
+ * One-sided boundary difference for the "p" sweep.
+ *
+ * flagxyzb.y selects the boundary: 1/2/3 are the cells with coords.x/y/z == 1,
+ * 4/5/6 the cells with coords.x/y/z == job.end - job.start - 1.
+ * `stencil` points at the current cell; negative offsets are read on the
+ * high side, so the caller must provide them.
+ *
+ * Returns stencil[1] - stencil[0] on a low boundary cell, the difference of
+ * two minmod2-limited values on a high boundary cell, and 0.0 for every
+ * other cell or flag value.
+ */
+__device__ REAL OCFD_bound_character_kernel_p(dim3 flagxyzb, dim3 coords, REAL *stencil, cudaJobPackage job){
+
+	const unsigned int face = flagxyzb.y;
+	unsigned int pos, last;
+
+	// Pick the coordinate that runs along the flagged direction.
+	switch(face){
+		case 1:
+		case 4:
+			pos = coords.x;
+			last = job.end.x-job.start.x-1;
+			break;
+
+		case 2:
+		case 5:
+			pos = coords.y;
+			last = job.end.y-job.start.y-1;
+			break;
+
+		case 3:
+		case 6:
+			pos = coords.z;
+			last = job.end.z-job.start.z-1;
+			break;
+
+		default:
+			return 0.0;
+	}
+
+	if(face <= 3){
+		// Low side: plain first-order difference.
+		if(pos != 1) return 0.0;
+		//tmp = (-11.0*stencil[0] + 18.0*stencil[1] - 9.0*stencil[2] + 2.0*stencil[3])/6.0;
+		return (stencil[1] - stencil[0]);
+	}
+
+	// High side: limited reconstruction; the right value passes the same
+	// slope twice to minmod2.
+	if(pos != last) return 0.0;
+
+	//tmp = (11.0*stencil[0] - 18.0*stencil[-1] + 9.0*stencil[-2] - 2.0*stencil[-3])/6.0;
+	const REAL right_val = stencil[0] + 0.5*minmod2(stencil[0] - stencil[-1], stencil[0] - stencil[-1]);
+	const REAL left_val = stencil[-1] + 0.5*minmod2(stencil[0] - stencil[-1], stencil[-1] - stencil[-2]);
+
+	return right_val - left_val;
+}
+
+
+
+/*
+ * One-sided boundary difference for the "m" sweep.
+ *
+ * flagxyzb.y selects the boundary: 1/2/3 are the cells with coords.x/y/z == 1,
+ * 4/5/6 the cells with coords.x/y/z == job.end - job.start - 1.
+ * `stencil` points at the current cell; offsets -2..1 are read, so the caller
+ * must provide them.
+ *
+ * Returns the difference of two minmod2-limited values on a low boundary
+ * cell, stencil[-1] - stencil[-2] on a high boundary cell, and 0.0 for every
+ * other cell or flag value.
+ */
+__device__ REAL OCFD_bound_character_kernel_m(dim3 flagxyzb, dim3 coords, REAL *stencil, cudaJobPackage job){
+
+	const unsigned int face = flagxyzb.y;
+	unsigned int pos, last;
+
+	// Pick the coordinate that runs along the flagged direction.
+	switch(face){
+		case 1:
+		case 4:
+			pos = coords.x;
+			last = job.end.x-job.start.x-1;
+			break;
+
+		case 2:
+		case 5:
+			pos = coords.y;
+			last = job.end.y-job.start.y-1;
+			break;
+
+		case 3:
+		case 6:
+			pos = coords.z;
+			last = job.end.z-job.start.z-1;
+			break;
+
+		default:
+			return 0.0;
+	}
+
+	if(face <= 3){
+		// Low side: limited reconstruction; the left value passes the same
+		// slope twice to minmod2.
+		if(pos != 1) return 0.0;
+
+		//tmp = (-11.0*stencil[-1] + 18.0*stencil[0] - 9.0*stencil[1] + 2.0*stencil[2])/6.0;
+		const REAL right_val = stencil[0] - 0.5*minmod2(stencil[1] - stencil[0], stencil[0] - stencil[-1]);
+		const REAL left_val = stencil[-1] - 0.5*minmod2(stencil[0] - stencil[-1], stencil[0] - stencil[-1]);
+
+		return right_val - left_val;
+	}
+
+	// High side: plain first-order difference one cell inwards.
+	if(pos != last) return 0.0;
+
+	//tmp = (11.0*stencil[-1] - 18.0*stencil[-2] + 9.0*stencil[-3] - 2.0*stencil[-4])/6.0;
+	return (stencil[-1] - stencil[-2]);
+}
+
+/*
+ * Fill para_ch[0..13] with the averaged state and the local basis at the
+ * interface between the cell (coords + job.start) and its "+1" neighbour
+ * along the sweep direction.
+ *
+ *   flagxyz  1/4 -> neighbour in x, 2/5 -> in y, 3/6 -> in z
+ *            (expected to be one of 1..6)
+ *   u,v,w,cc,Ax,Ay,Az  fields averaged over the two cells
+ *   para_ch  output: [0..2] u,v,w  [3] cc  [4..6] n = (Ax,Ay,Az)/|.|
+ *            [7..9] l (unit vector orthogonal to n)  [10..12] m = n x l
+ *            [13] KK = (Gamma_d - 1)/cc^2
+ */
+__device__ void get_para_charteric_p_kernel(
+    int flagxyz,
+    dim3 coords,
+    cudaField u,
+	cudaField v,
+	cudaField w,
+	cudaField cc,
+	cudaField Ax,
+	cudaField Ay,
+	cudaField Az,
+    REAL *para_ch,
+    cudaJobPackage job
+){
+    const unsigned int x = coords.x + job.start.x;
+	const unsigned int y = coords.y + job.start.y;
+	const unsigned int z = coords.z + job.start.z;
+
+    // Unit step to the neighbour on the "+" side of the sweep direction.
+    unsigned int sx = 0, sy = 0, sz = 0;
+    switch(flagxyz){
+         case 1:
+         case 4:
+              sx = 1;
+              break;
+
+         case 2:
+         case 5:
+              sy = 1;
+              break;
+
+         case 3:
+         case 6:
+              sz = 1;
+              break;
+    }
+    const unsigned int xn = x + sx;
+    const unsigned int yn = y + sy;
+    const unsigned int zn = z + sz;
+
+    // Arithmetic mean of this cell and its neighbour.
+    const REAL u1 = (get_Field_LAP(u,  x, y, z) + get_Field_LAP(u,  xn, yn, zn))*0.5;
+    const REAL v1 = (get_Field_LAP(v,  x, y, z) + get_Field_LAP(v,  xn, yn, zn))*0.5;
+    const REAL w1 = (get_Field_LAP(w,  x, y, z) + get_Field_LAP(w,  xn, yn, zn))*0.5;
+    const REAL c1 = (get_Field_LAP(cc, x, y, z) + get_Field_LAP(cc, xn, yn, zn))*0.5;
+    const REAL a1 = (get_Field_LAP(Ax, x, y, z) + get_Field_LAP(Ax, xn, yn, zn))*0.5;
+    const REAL a2 = (get_Field_LAP(Ay, x, y, z) + get_Field_LAP(Ay, xn, yn, zn))*0.5;
+    const REAL a3 = (get_Field_LAP(Az, x, y, z) + get_Field_LAP(Az, xn, yn, zn))*0.5;
+
+    // Normalised direction n.
+    REAL len = sqrt(a1*a1 + a2*a2 + a3*a3);
+    const REAL n1 = a1/len;
+    const REAL n2 = a2/len;
+    const REAL n3 = a3/len;
+
+    // l: orthogonal to n, built in the plane that contains the larger of
+    // |n2| and |n3|.
+    REAL l1, l2, l3;
+    if(fabs(n3) <= fabs(n2)){
+        len = sqrt(n1*n1 + n2*n2);
+        l1 = -n2/len;
+        l2 = n1/len;
+        l3 = 0.0;
+    }else{
+        len = sqrt(n1*n1 + n3*n3);
+        l1 = -n3/len;
+        l2 = 0.0;
+        l3 = n1/len;
+    }
+
+    para_ch[0] = u1;
+    para_ch[1] = v1;
+    para_ch[2] = w1;
+    para_ch[3] = c1;
+    para_ch[4] = n1;
+    para_ch[5] = n2;
+    para_ch[6] = n3;
+    para_ch[7] = l1;
+    para_ch[8] = l2;
+    para_ch[9] = l3;
+    // m = n x l
+    para_ch[10] = n2*l3 - n3*l2;
+    para_ch[11] = n3*l1 - n1*l3;
+    para_ch[12] = n1*l2 - n2*l1;
+    para_ch[13] = (Gamma_d - 1.0)/(c1*c1);
+
+}
+
+
+/*
+ * Fill para_ch[0..13] with the averaged state and the local basis at the
+ * interface between the cell (coords + job.start) and its "-1" neighbour
+ * along the sweep direction.
+ *
+ *   flagxyz  1/4 -> neighbour in x, 2/5 -> in y, 3/6 -> in z
+ *            (expected to be one of 1..6)
+ *   u,v,w,cc,Ax,Ay,Az  fields averaged over the two cells
+ *   para_ch  output: [0..2] u,v,w  [3] cc  [4..6] n = (Ax,Ay,Az)/|.|
+ *            [7..9] l (unit vector orthogonal to n)  [10..12] m = n x l
+ *            [13] KK = (Gamma_d - 1)/cc^2
+ */
+__device__ void get_para_charteric_m_kernel(
+    int flagxyz,
+    dim3 coords,
+    cudaField u,
+	cudaField v,
+	cudaField w,
+	cudaField cc,
+	cudaField Ax,
+	cudaField Ay,
+	cudaField Az,
+    REAL *para_ch,
+    cudaJobPackage job
+){
+    const unsigned int x = coords.x + job.start.x;
+	const unsigned int y = coords.y + job.start.y;
+	const unsigned int z = coords.z + job.start.z;
+
+    // Unit step to the neighbour on the "-" side of the sweep direction.
+    unsigned int sx = 0, sy = 0, sz = 0;
+    switch(flagxyz){
+         case 1:
+         case 4:
+              sx = 1;
+              break;
+
+         case 2:
+         case 5:
+              sy = 1;
+              break;
+
+         case 3:
+         case 6:
+              sz = 1;
+              break;
+    }
+    const unsigned int xp = x - sx;
+    const unsigned int yp = y - sy;
+    const unsigned int zp = z - sz;
+
+    // Arithmetic mean of the neighbour and this cell.
+    const REAL u1 = (get_Field_LAP(u,  xp, yp, zp) + get_Field_LAP(u,  x, y, z))*0.5;
+    const REAL v1 = (get_Field_LAP(v,  xp, yp, zp) + get_Field_LAP(v,  x, y, z))*0.5;
+    const REAL w1 = (get_Field_LAP(w,  xp, yp, zp) + get_Field_LAP(w,  x, y, z))*0.5;
+    const REAL c1 = (get_Field_LAP(cc, xp, yp, zp) + get_Field_LAP(cc, x, y, z))*0.5;
+    const REAL a1 = (get_Field_LAP(Ax, xp, yp, zp) + get_Field_LAP(Ax, x, y, z))*0.5;
+    const REAL a2 = (get_Field_LAP(Ay, xp, yp, zp) + get_Field_LAP(Ay, x, y, z))*0.5;
+    const REAL a3 = (get_Field_LAP(Az, xp, yp, zp) + get_Field_LAP(Az, x, y, z))*0.5;
+
+    // Normalised direction n.
+    REAL len = sqrt(a1*a1 + a2*a2 + a3*a3);
+    const REAL n1 = a1/len;
+    const REAL n2 = a2/len;
+    const REAL n3 = a3/len;
+
+    // l: orthogonal to n, built in the plane that contains the larger of
+    // |n2| and |n3|.
+    REAL l1, l2, l3;
+    if(fabs(n3) <= fabs(n2)){
+        len = sqrt(n1*n1 + n2*n2);
+        l1 = -n2/len;
+        l2 = n1/len;
+        l3 = 0.0;
+    }else{
+        len = sqrt(n1*n1 + n3*n3);
+        l1 = -n3/len;
+        l2 = 0.0;
+        l3 = n1/len;
+    }
+
+    para_ch[0] = u1;
+    para_ch[1] = v1;
+    para_ch[2] = w1;
+    para_ch[3] = c1;
+    para_ch[4] = n1;
+    para_ch[5] = n2;
+    para_ch[6] = n3;
+    para_ch[7] = l1;
+    para_ch[8] = l2;
+    para_ch[9] = l3;
+    // m = n x l
+    para_ch[10] = n2*l3 - n3*l2;
+    para_ch[11] = n3*l1 - n1*l3;
+    para_ch[12] = n1*l2 - n2*l1;
+    para_ch[13] = (Gamma_d - 1.0)/(c1*c1);
+
+}
+
+/*
+ * Add the five differences in stencil[0..4] to the five components of du at
+ * the cell (coords + job.start), scaled by -Ajac(x,y,z)/h, where h is hx_d,
+ * hy_d or hz_d for flagxyzb.x in {1,4}, {2,5} or {3,6}. Other values of
+ * flagxyzb.x leave du untouched. The update is a plain read-modify-write
+ * (no atomics).
+ *
+ * When flagxyzb.z == 1, the sweep flag is the 4/5/6 variant and the matching
+ * coordinate equals 1, a zero is added instead.
+ * NOTE(review): this test reads flagxyzb.x, whereas get_du_charteric_m_kernel
+ * and the put_du_character_* helpers test flagxyzb.y -- confirm it is intended.
+ */
+__device__ void get_du_charteric_p_kernel(dim3 flagxyzb, dim3 coords, cudaSoA du, REAL *stencil, cudaField Ajac, cudaJobPackage job){
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	REAL h;				// grid spacing of the sweep direction
+	bool suppressed;	// true -> add zero instead of the difference
+
+	switch(flagxyzb.x){
+		case 1:
+		case 4:
+			h = hx_d;
+			suppressed = (flagxyzb.x == 4 && coords.x == 1 && flagxyzb.z == 1);
+			break;
+
+		case 2:
+		case 5:
+			h = hy_d;
+			suppressed = (flagxyzb.x == 5 && coords.y == 1 && flagxyzb.z == 1);
+			break;
+
+		case 3:
+		case 6:
+			h = hz_d;
+			suppressed = (flagxyzb.x == 6 && coords.z == 1 && flagxyzb.z == 1);
+			break;
+
+		default:
+			return;
+	}
+
+	for(int i = 0; i < 5; i++){
+		if(suppressed){
+			get_SoA(du, x-LAP, y-LAP, z-LAP, i) += 0;
+		}else{
+			get_SoA(du, x-LAP, y-LAP, z-LAP, i) += -get_Field_LAP(Ajac, x, y, z)*stencil[i]/h;
+		}
+	}
+}
+
+
+/*
+ * Add the five differences in stencil[0..4] to the five components of du at
+ * the cell one step below (coords + job.start) along the sweep direction,
+ * scaled by -Ajac/h with Ajac read at that same shifted cell. h is hx_d, hy_d
+ * or hz_d for flagxyzb.x in {1,4}, {2,5} or {3,6}; other values leave du
+ * untouched. The update is a plain read-modify-write (no atomics).
+ *
+ * When flagxyzb.z == 1 and flagxyzb.y is the matching high-boundary flag
+ * (4/5/6) on the cell with coordinate job.end - job.start - 1, a zero is
+ * added instead.
+ */
+__device__ void get_du_charteric_m_kernel(dim3 flagxyzb, dim3 coords, cudaSoA du, REAL *stencil, cudaField Ajac, cudaJobPackage job){
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	REAL h;							// grid spacing of the sweep direction
+	bool suppressed;				// true -> add zero instead of the difference
+	unsigned int ox = 0, oy = 0, oz = 0;	// one-cell shift of the target
+
+	switch(flagxyzb.x){
+		case 1:
+		case 4:
+			h = hx_d;
+			ox = 1;
+			suppressed = (flagxyzb.y == 4 && coords.x == job.end.x-job.start.x-1 && flagxyzb.z == 1);
+			break;
+
+		case 2:
+		case 5:
+			h = hy_d;
+			oy = 1;
+			suppressed = (flagxyzb.y == 5 && coords.y == job.end.y-job.start.y-1 && flagxyzb.z == 1);
+			break;
+
+		case 3:
+		case 6:
+			h = hz_d;
+			oz = 1;
+			suppressed = (flagxyzb.y == 6 && coords.z == job.end.z-job.start.z-1 && flagxyzb.z == 1);
+			break;
+
+		default:
+			return;
+	}
+
+	// Shifted cell in LAP-padded coordinates.
+	const unsigned int xs = x - ox;
+	const unsigned int ys = y - oy;
+	const unsigned int zs = z - oz;
+
+	for(int i = 0; i < 5; i++){
+		if(suppressed){
+			get_SoA(du, xs-LAP, ys-LAP, zs-LAP, i) += 0;
+		}else{
+			get_SoA(du, xs-LAP, ys-LAP, zs-LAP, i) += -get_Field_LAP(Ajac, xs, ys, zs)*stencil[i]/h;
+		}
+	}
+}
+
+
+/*
+ * Transform the five values in stencil_ch to characteristic space, in place:
+ * stencil_ch <- S * stencil_ch, with the 5x5 matrix S assembled below.
+ *
+ * para_ch is read with the layout
+ *   [0..2] u,v,w  [3] c  [4..6] n  [7..9] l  [10..12] m  [13] KK
+ * (the layout written by the get_para_charteric_*_kernel helpers).
+ */
+__device__ void flux_charteric_ptoc_kernel(
+	REAL *stencil_ch,
+    REAL *para_ch)
+{
+    const REAL u1 = para_ch[0],  v1 = para_ch[1],  w1 = para_ch[2];
+    const REAL c1 = para_ch[3];
+    const REAL n1 = para_ch[4],  n2 = para_ch[5],  n3 = para_ch[6];
+    const REAL l1 = para_ch[7],  l2 = para_ch[8],  l3 = para_ch[9];
+    const REAL m1 = para_ch[10], m2 = para_ch[11], m3 = para_ch[12];
+    const REAL KK = para_ch[13];
+
+    const REAL X1 = 1.0/(2.0*c1);
+    const REAL un = u1*n1 + v1*n2 + w1*n3;			// velocity along n
+    const REAL v2 = (u1*u1 + v1*v1 + w1*w1)*0.5;	// half squared speed
+
+    // Input values, read before anything is overwritten.
+    const REAL q0 = stencil_ch[0];
+    const REAL q1 = stencil_ch[1];
+    const REAL q2 = stencil_ch[2];
+    const REAL q3 = stencil_ch[3];
+    const REAL q4 = stencil_ch[4];
+
+    // S = L (left characteristic matrix), row by row.
+    const REAL S11 = 1.0 - KK*v2;
+    const REAL S12 = KK*u1;
+    const REAL S13 = KK*v1;
+    const REAL S14 = KK*w1;
+    const REAL S15 = -KK;
+
+    const REAL S21 = -(u1*l1 + v1*l2 + w1*l3);
+    const REAL S22 = l1;
+    const REAL S23 = l2;
+    const REAL S24 = l3;
+    const REAL S25 = 0.0;
+
+    const REAL S31 = -(u1*m1 + v1*m2 + w1*m3);
+    const REAL S32 = n2*l3 - n3*l2;
+    const REAL S33 = n3*l1 - n1*l3;
+    const REAL S34 = n1*l2 - n2*l1;
+    const REAL S35 = 0.0;
+
+    const REAL S41 = KK*v2*0.5 + X1*un;
+    const REAL S42 = -X1*n1 - KK*u1*0.5;
+    const REAL S43 = -X1*n2 - KK*v1*0.5;
+    const REAL S44 = -X1*n3 - KK*w1*0.5;
+    const REAL S45 = KK*0.5;
+
+    // Row 5 is row 4 with the sign of the X1 terms flipped.
+    const REAL S51 = S41 - 2*X1*un;
+    const REAL S52 = 2*X1*n1 + S42;
+    const REAL S53 = 2*X1*n2 + S43;
+    const REAL S54 = 2*X1*n3 + S44;
+    const REAL S55 = S45;
+
+    stencil_ch[0] = S11*q0 + S12*q1 + S13*q2 + S14*q3 + S15*q4;
+    stencil_ch[1] = S21*q0 + S22*q1 + S23*q2 + S24*q3 + S25*q4;
+    stencil_ch[2] = S31*q0 + S32*q1 + S33*q2 + S34*q3 + S35*q4;
+    stencil_ch[3] = S41*q0 + S42*q1 + S43*q2 + S44*q3 + S45*q4;
+    stencil_ch[4] = S51*q0 + S52*q1 + S53*q2 + S54*q3 + S55*q4;
+
+}
+
+/*
+ * Multiply the five values in `stencil` by the 5x5 matrix S assembled below,
+ * in place. Judging by the name (ctop) this is presumably the transform from
+ * characteristic space back to the original variables, i.e. the counterpart
+ * of flux_charteric_ptoc_kernel -- TODO confirm.
+ *
+ * para_ch is read with the layout
+ *   [0..2] u,v,w  [3] c  [4..6] n  [7..9] l  [10..12] m  [13] KK
+ */
+__device__ void flux_charteric_ctop_kernel(
+	REAL *stencil,
+    REAL *para_ch)
+{
+     const REAL u1 = para_ch[0],  v1 = para_ch[1],  w1 = para_ch[2];
+     const REAL c1 = para_ch[3];
+     const REAL n1 = para_ch[4],  n2 = para_ch[5],  n3 = para_ch[6];
+     const REAL l1 = para_ch[7],  l2 = para_ch[8],  l3 = para_ch[9];
+     const REAL m1 = para_ch[10], m2 = para_ch[11], m3 = para_ch[12];
+     const REAL KK = para_ch[13];
+
+     // Input values, read before anything is overwritten.
+     const REAL q0 = stencil[0];
+     const REAL q1 = stencil[1];
+     const REAL q2 = stencil[2];
+     const REAL q3 = stencil[3];
+     const REAL q4 = stencil[4];
+
+     // Matrix S, row by row. In rows 2-5 the last two columns are
+     // (value - c1*term) and (value + c1*term), the latter written as
+     // 2*value - (column 4).
+     const REAL S11 = 1.0;
+     const REAL S12 = 0.0;
+     const REAL S13 = 0.0;
+     const REAL S14 = 1.0;
+     const REAL S15 = 1.0;
+
+     const REAL S21 = u1;
+     const REAL S22 = l1;
+     const REAL S23 = n2*l3 - n3*l2;
+     const REAL S24 = u1 - c1*n1;
+     const REAL S25 = 2*u1 - S24;
+
+     const REAL S31 = v1;
+     const REAL S32 = l2;
+     const REAL S33 = n3*l1 - n1*l3;
+     const REAL S34 = v1 - c1*n2;
+     const REAL S35 = 2*v1 - S34;
+
+     const REAL S41 = w1;
+     const REAL S42 = l3;
+     const REAL S43 = n1*l2 - n2*l1;
+     const REAL S44 = w1 - c1*n3;
+     const REAL S45 = 2*w1 - S44;
+
+     const REAL S51 = (u1*u1 + v1*v1 + w1*w1)*0.5;
+     const REAL H = S51 + 1.0/KK;
+     const REAL S52 = u1*l1 + v1*l2 + w1*l3;
+     const REAL S53 = u1*m1 + v1*m2 + w1*m3;
+     const REAL S54 = H - c1*(u1*n1 + v1*n2 + w1*n3);
+     const REAL S55 = 2*H - S54;
+
+     stencil[0] = S11*q0 + S12*q1 + S13*q2 + S14*q3 + S15*q4;
+     stencil[1] = S21*q0 + S22*q1 + S23*q2 + S24*q3 + S25*q4;
+     stencil[2] = S31*q0 + S32*q1 + S33*q2 + S34*q3 + S35*q4;
+     stencil[3] = S41*q0 + S42*q1 + S43*q2 + S44*q3 + S45*q4;
+     stencil[4] = S51*q0 + S52*q1 + S53*q2 + S54*q3 + S55*q4;
+
+}
+
+/*
+ * OCFD_weno7_SYMBO_character_P_kernel
+ *
+ * Per-thread characteristic-wise reconstruction of the five components of `f`
+ * (the "_P" / "_p" helper family; presumably the positive split flux -- confirm).
+ *
+ *  1. For each component, get_data_kernel fills one 8-value row of `stencil`
+ *     (window offsets ia1 = -3 .. ib1 = 4) using the dynamic shared array `sort`,
+ *     and OCFD_bound_character_kernel_p yields a per-component value kept in tmp[].
+ *  2. Threads whose last get_data_kernel call returned 0 stop here.
+ *  3. Each of the 8 stencil points (5 components) goes through
+ *     flux_charteric_ptoc_kernel with the parameters from
+ *     get_para_charteric_p_kernel.
+ *  4. Each component is reconstructed by OCFD_bound_scheme_kernel_p or, when that
+ *     returns non-zero, by OCFD_weno7_SYMBO_kernel_P.
+ *  5. The five results go back through flux_charteric_ctop_kernel, are exchanged
+ *     with the neighbouring lane via __shfl_up_double, and
+ *     put_du_character_p_kernel is called by every thread with threadIdx.x != 0.
+ *
+ * Launch requirement: dynamic shared memory for `sort` (size chosen by the caller).
+ */
+__global__ void OCFD_weno7_SYMBO_character_P_kernel(
+    int WENO_LMT_FLAG,
+    dim3 flagxyzb,
+    cudaSoA f,
+    cudaSoA du,
+    cudaField Ajac,
+    cudaField u,
+    cudaField v,
+    cudaField w,
+    cudaField cc,
+    cudaField Ax,
+    cudaField Ay,
+    cudaField Az,
+    cudaJobPackage job)
+{
+    extern __shared__ REAL sort[];
+
+    dim3 coords;
+    REAL stencil[40];          // 5 rows (components) x 8 stencil points
+    REAL para_ch[14];
+    REAL tmp[5];
+    REAL face_r, face_l;
+
+    int ia1 = -3;
+    int ib1 = 4;
+    int flag;
+
+    // Gather the window of every component; `flag` keeps the last return value.
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = get_data_kernel(flagxyzb.x, &coords, f, eq, row, ia1, ib1, sort, job);
+
+        tmp[eq] = OCFD_bound_character_kernel_p(flagxyzb, coords, row - ia1, job);
+    }
+
+    if(flag == 0) return;
+
+    get_para_charteric_p_kernel(flagxyzb.x, coords, u, v, w, cc, Ax, Ay, Az, para_ch, job);
+
+    // Transform one stencil point at a time: gather its 5 components (transpose),
+    // apply the transform, scatter back.
+    for(int pt = 0; pt < 8; pt++){
+        REAL column[5];
+
+        for(int eq = 0; eq < 5; eq++) column[eq] = stencil[8*eq + pt];
+
+        flux_charteric_ptoc_kernel(column, para_ch);
+
+        for(int eq = 0; eq < 5; eq++) stencil[8*eq + pt] = column[eq];
+    }
+
+    // Reconstruct each field; results are packed into stencil[0..4]
+    // (row `eq` is fully consumed before slot `eq` is overwritten).
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = OCFD_bound_scheme_kernel_p(&face_r, flagxyzb, coords, row, ia1, ib1, job);
+
+        if(flag != 0) face_r = OCFD_weno7_SYMBO_kernel_P(WENO_LMT_FLAG, row);
+
+        stencil[eq] = face_r;
+    }
+
+    flux_charteric_ctop_kernel(stencil, para_ch);
+
+    // Fetch the neighbouring lane's value and hand both to put_du_character_p_kernel.
+    for(int eq = 0; eq < 5; eq++){
+        face_r = stencil[eq];
+        face_l = __shfl_up_double(face_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_character_p_kernel(flagxyzb, coords, tmp[eq], face_r, face_l, du, eq, Ajac, job);
+    }
+}
+
+
+
+/*
+ * OCFD_weno7_SYMBO_character_M_kernel
+ *
+ * "_M" / "_m" counterpart of the characteristic-wise WENO7-SYMBO reconstruction
+ * (presumably the negative split flux -- confirm). Window offsets are
+ * ia1 = -4 .. ib1 = 3.
+ *
+ * Steps: gather 5 rows of 8 values with get_data_kernel and the per-component
+ * value from OCFD_bound_character_kernel_m; stop if the last gather returned 0;
+ * transform every stencil point with flux_charteric_ptoc_kernel; reconstruct each
+ * component (OCFD_bound_scheme_kernel_m, else OCFD_weno7_SYMBO_kernel_M);
+ * transform back with flux_charteric_ctop_kernel; exchange with the neighbouring
+ * lane through __shfl_up_double and call put_du_character_m_kernel for
+ * threadIdx.x != 0.
+ *
+ * Launch requirement: dynamic shared memory for `sort` (size chosen by the caller).
+ */
+__global__ void OCFD_weno7_SYMBO_character_M_kernel(
+    int WENO_LMT_FLAG,
+    dim3 flagxyzb,
+    cudaSoA f,
+    cudaSoA du,
+    cudaField Ajac,
+    cudaField u,
+    cudaField v,
+    cudaField w,
+    cudaField cc,
+    cudaField Ax,
+    cudaField Ay,
+    cudaField Az,
+    cudaJobPackage job)
+{
+    extern __shared__ REAL sort[];
+
+    dim3 coords;
+    REAL stencil[40];          // 5 rows (components) x 8 stencil points
+    REAL para_ch[14];
+    REAL tmp[5];
+    REAL face_r, face_l;
+
+    int ia1 = -4;
+    int ib1 = 3;
+    int flag;
+
+    // Gather the window of every component; `flag` keeps the last return value.
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = get_data_kernel(flagxyzb.x, &coords, f, eq, row, ia1, ib1, sort, job);
+
+        tmp[eq] = OCFD_bound_character_kernel_m(flagxyzb, coords, row - ia1, job);
+    }
+
+    if(flag == 0) return;
+
+    get_para_charteric_m_kernel(flagxyzb.x, coords, u, v, w, cc, Ax, Ay, Az, para_ch, job);
+
+    // Per stencil point: transpose out, transform, transpose back.
+    for(int pt = 0; pt < 8; pt++){
+        REAL column[5];
+
+        for(int eq = 0; eq < 5; eq++) column[eq] = stencil[8*eq + pt];
+
+        flux_charteric_ptoc_kernel(column, para_ch);
+
+        for(int eq = 0; eq < 5; eq++) stencil[8*eq + pt] = column[eq];
+    }
+
+    // Reconstruct each field; results are packed into stencil[0..4].
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = OCFD_bound_scheme_kernel_m(&face_r, flagxyzb, coords, row, ia1, ib1, job);
+
+        if(flag != 0) face_r = OCFD_weno7_SYMBO_kernel_M(WENO_LMT_FLAG, row);
+
+        stencil[eq] = face_r;
+    }
+
+    flux_charteric_ctop_kernel(stencil, para_ch);
+
+    for(int eq = 0; eq < 5; eq++){
+        face_r = stencil[eq];
+        face_l = __shfl_up_double(face_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_character_m_kernel(flagxyzb, coords, tmp[eq], face_r, face_l, du, eq, Ajac, job);
+    }
+}
+
+
+
+/*
+ * OCFD_HybridAuto_character_P_kernel
+ *
+ * Characteristic-wise reconstruction ("_P" family, window ia1 = -3 .. ib1 = 4)
+ * where the scheme is chosen per point from `scheme` via
+ * get_Hyscheme_flag_p_kernel:
+ *     1       -> OCFD_OMP6_kernel_P(0, row)
+ *     2       -> OCFD_weno7_kernel_P(row)
+ *     other   -> OCFD_NND2_kernel_P(row + 2)
+ * These are used only where OCFD_bound_scheme_kernel_p returns non-zero.
+ *
+ * The characteristic transform (flux_charteric_ptoc_kernel before,
+ * flux_charteric_ctop_kernel after) is applied for every scheme value.
+ * Results are exchanged with the neighbouring lane via __shfl_up_double and
+ * written by put_du_character_p_kernel for threadIdx.x != 0.
+ *
+ * Launch requirement: dynamic shared memory for `sort` (size chosen by the caller).
+ */
+__global__ void OCFD_HybridAuto_character_P_kernel(
+    dim3 flagxyzb,
+    cudaSoA f,
+    cudaSoA du,
+    cudaField Ajac,
+    cudaField u,
+    cudaField v,
+    cudaField w,
+    cudaField cc,
+    cudaField Ax,
+    cudaField Ay,
+    cudaField Az,
+    cudaField_int scheme,
+    cudaJobPackage job)
+{
+    extern __shared__ REAL sort[];
+
+    dim3 coords;
+    REAL stencil[40];          // 5 rows (components) x 8 stencil points
+    REAL para_ch[14];
+    REAL tmp[5];
+    REAL face_r, face_l;
+
+    int ia1 = -3;
+    int ib1 = 4;
+    int flag;
+
+    // Gather the window of every component; `flag` keeps the last return value.
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = get_data_kernel(flagxyzb.x, &coords, f, eq, row, ia1, ib1, sort, job);
+
+        tmp[eq] = OCFD_bound_character_kernel_p(flagxyzb, coords, row - ia1, job);
+    }
+
+    if(flag == 0) return;
+
+    const int Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);
+
+    get_para_charteric_p_kernel(flagxyzb.x, coords, u, v, w, cc, Ax, Ay, Az, para_ch, job);
+
+    // Per stencil point: transpose out, transform, transpose back.
+    for(int pt = 0; pt < 8; pt++){
+        REAL column[5];
+
+        for(int eq = 0; eq < 5; eq++) column[eq] = stencil[8*eq + pt];
+
+        flux_charteric_ptoc_kernel(column, para_ch);
+
+        for(int eq = 0; eq < 5; eq++) stencil[8*eq + pt] = column[eq];
+    }
+
+    // Reconstruct each field; results are packed into stencil[0..4].
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = OCFD_bound_scheme_kernel_p(&face_r, flagxyzb, coords, row, ia1, ib1, job);
+
+        if(flag != 0){
+            switch(Hyscheme_flag){
+                case 1:
+                    face_r = OCFD_OMP6_kernel_P(0, row);
+                    break;
+                case 2:
+                    face_r = OCFD_weno7_kernel_P(row);
+                    break;
+                default:
+                    face_r = OCFD_NND2_kernel_P(row + 2);
+                    break;
+            }
+        }
+
+        stencil[eq] = face_r;
+    }
+
+    flux_charteric_ctop_kernel(stencil, para_ch);
+
+    for(int eq = 0; eq < 5; eq++){
+        face_r = stencil[eq];
+        face_l = __shfl_up_double(face_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_character_p_kernel(flagxyzb, coords, tmp[eq], face_r, face_l, du, eq, Ajac, job);
+    }
+}
+
+
+/*
+ * OCFD_HybridAuto_character_P_Jameson_kernel
+ *
+ * Hybrid reconstruction ("_P" family, window ia1 = -3 .. ib1 = 4) with the
+ * scheme chosen per point by get_Hyscheme_flag_p_kernel:
+ *     1       -> OCFD_UP7_kernel_P(row), applied directly to the gathered data
+ *     2       -> OCFD_weno7_kernel_P(row)
+ *     other   -> OCFD_weno5_kernel_P(row + 1)
+ * For scheme values other than 1 the data is first transformed with
+ * flux_charteric_ptoc_kernel and the results are transformed back with
+ * flux_charteric_ctop_kernel; for scheme 1 both transforms are skipped and
+ * para_ch is left unset (it is not read in that case).
+ *
+ * Results are exchanged with the neighbouring lane via __shfl_up_double and
+ * written by put_du_character_p_kernel for threadIdx.x != 0.
+ *
+ * Launch requirement: dynamic shared memory for `sort` (size chosen by the caller).
+ */
+__global__ void OCFD_HybridAuto_character_P_Jameson_kernel(
+    dim3 flagxyzb,
+    cudaSoA f,
+    cudaSoA du,
+    cudaField Ajac,
+    cudaField u,
+    cudaField v,
+    cudaField w,
+    cudaField cc,
+    cudaField Ax,
+    cudaField Ay,
+    cudaField Az,
+    cudaField_int scheme,
+    cudaJobPackage job)
+{
+    extern __shared__ REAL sort[];
+
+    dim3 coords;
+    REAL stencil[40];          // 5 rows (components) x 8 stencil points
+    REAL para_ch[14];
+    REAL tmp[5];
+    REAL face_r, face_l;
+
+    int ia1 = -3;
+    int ib1 = 4;
+    int flag;
+
+    // Gather the window of every component; `flag` keeps the last return value.
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = get_data_kernel(flagxyzb.x, &coords, f, eq, row, ia1, ib1, sort, job);
+
+        tmp[eq] = OCFD_bound_character_kernel_p(flagxyzb, coords, row - ia1, job);
+    }
+
+    if(flag == 0) return;
+
+    const int Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);
+    const int in_char_space = (Hyscheme_flag != 1);
+
+    if(in_char_space){
+        get_para_charteric_p_kernel(flagxyzb.x, coords, u, v, w, cc, Ax, Ay, Az, para_ch, job);
+
+        // Per stencil point: transpose out, transform, transpose back.
+        for(int pt = 0; pt < 8; pt++){
+            REAL column[5];
+
+            for(int eq = 0; eq < 5; eq++) column[eq] = stencil[8*eq + pt];
+
+            flux_charteric_ptoc_kernel(column, para_ch);
+
+            for(int eq = 0; eq < 5; eq++) stencil[8*eq + pt] = column[eq];
+        }
+    }
+
+    // Reconstruct each field; results are packed into stencil[0..4].
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = OCFD_bound_scheme_kernel_p(&face_r, flagxyzb, coords, row, ia1, ib1, job);
+
+        if(flag != 0){
+            switch(Hyscheme_flag){
+                case 1:
+                    face_r = OCFD_UP7_kernel_P(row);
+                    break;
+                case 2:
+                    face_r = OCFD_weno7_kernel_P(row);
+                    break;
+                default:
+                    face_r = OCFD_weno5_kernel_P(row + 1);
+                    break;
+            }
+        }
+
+        stencil[eq] = face_r;
+    }
+
+    if(in_char_space) flux_charteric_ctop_kernel(stencil, para_ch);
+
+    for(int eq = 0; eq < 5; eq++){
+        face_r = stencil[eq];
+        face_l = __shfl_up_double(face_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_character_p_kernel(flagxyzb, coords, tmp[eq], face_r, face_l, du, eq, Ajac, job);
+    }
+}
+
+
+
+/*
+ * OCFD_HybridAuto_character_M_kernel
+ *
+ * Characteristic-wise reconstruction ("_M" family, window ia1 = -4 .. ib1 = 3)
+ * where the scheme is chosen per point from `scheme` via
+ * get_Hyscheme_flag_m_kernel:
+ *     1       -> OCFD_OMP6_kernel_M(0, row)
+ *     2       -> OCFD_weno7_kernel_M(row + 1)
+ *     other   -> OCFD_NND2_kernel_M(row + 3)
+ * These are used only where OCFD_bound_scheme_kernel_m returns non-zero.
+ *
+ * The characteristic transform (flux_charteric_ptoc_kernel before,
+ * flux_charteric_ctop_kernel after) is applied for every scheme value.
+ * Results are exchanged with the neighbouring lane via __shfl_up_double and
+ * written by put_du_character_m_kernel for threadIdx.x != 0.
+ *
+ * Launch requirement: dynamic shared memory for `sort` (size chosen by the caller).
+ */
+__global__ void OCFD_HybridAuto_character_M_kernel(
+    dim3 flagxyzb,
+    cudaSoA f,
+    cudaSoA du,
+    cudaField Ajac,
+    cudaField u,
+    cudaField v,
+    cudaField w,
+    cudaField cc,
+    cudaField Ax,
+    cudaField Ay,
+    cudaField Az,
+    cudaField_int scheme,
+    cudaJobPackage job)
+{
+    extern __shared__ REAL sort[];
+
+    dim3 coords;
+    REAL stencil[40];          // 5 rows (components) x 8 stencil points
+    REAL para_ch[14];
+    REAL tmp[5];
+    REAL face_r, face_l;
+
+    int ia1 = -4;
+    int ib1 = 3;
+    int flag;
+
+    // Gather the window of every component; `flag` keeps the last return value.
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = get_data_kernel(flagxyzb.x, &coords, f, eq, row, ia1, ib1, sort, job);
+
+        tmp[eq] = OCFD_bound_character_kernel_m(flagxyzb, coords, row - ia1, job);
+    }
+
+    if(flag == 0) return;
+
+    const int Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);
+
+    get_para_charteric_m_kernel(flagxyzb.x, coords, u, v, w, cc, Ax, Ay, Az, para_ch, job);
+
+    // Per stencil point: transpose out, transform, transpose back.
+    for(int pt = 0; pt < 8; pt++){
+        REAL column[5];
+
+        for(int eq = 0; eq < 5; eq++) column[eq] = stencil[8*eq + pt];
+
+        flux_charteric_ptoc_kernel(column, para_ch);
+
+        for(int eq = 0; eq < 5; eq++) stencil[8*eq + pt] = column[eq];
+    }
+
+    // Reconstruct each field; results are packed into stencil[0..4].
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = OCFD_bound_scheme_kernel_m(&face_r, flagxyzb, coords, row, ia1, ib1, job);
+
+        if(flag != 0){
+            switch(Hyscheme_flag){
+                case 1:
+                    face_r = OCFD_OMP6_kernel_M(0, row);
+                    break;
+                case 2:
+                    face_r = OCFD_weno7_kernel_M(row + 1);
+                    break;
+                default:
+                    face_r = OCFD_NND2_kernel_M(row + 3);
+                    break;
+            }
+        }
+
+        stencil[eq] = face_r;
+    }
+
+    flux_charteric_ctop_kernel(stencil, para_ch);
+
+    for(int eq = 0; eq < 5; eq++){
+        face_r = stencil[eq];
+        face_l = __shfl_up_double(face_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_character_m_kernel(flagxyzb, coords, tmp[eq], face_r, face_l, du, eq, Ajac, job);
+    }
+}
+
+
+
+/*
+ * OCFD_HybridAuto_character_M_Jameson_kernel
+ *
+ * Hybrid reconstruction ("_M" family, window ia1 = -4 .. ib1 = 3) with the
+ * scheme chosen per point by get_Hyscheme_flag_m_kernel:
+ *     1       -> OCFD_UP7_kernel_M(row), applied directly to the gathered data
+ *     2       -> OCFD_weno7_kernel_M(row + 1)
+ *     other   -> OCFD_weno5_kernel_M(row + 2)
+ * For scheme values other than 1 the data is first transformed with
+ * flux_charteric_ptoc_kernel and the results are transformed back with
+ * flux_charteric_ctop_kernel; for scheme 1 both transforms are skipped and
+ * para_ch is left unset (it is not read in that case).
+ *
+ * Results are exchanged with the neighbouring lane via __shfl_up_double and
+ * written by put_du_character_m_kernel for threadIdx.x != 0.
+ *
+ * Launch requirement: dynamic shared memory for `sort` (size chosen by the caller).
+ */
+__global__ void OCFD_HybridAuto_character_M_Jameson_kernel(
+    dim3 flagxyzb,
+    cudaSoA f,
+    cudaSoA du,
+    cudaField Ajac,
+    cudaField u,
+    cudaField v,
+    cudaField w,
+    cudaField cc,
+    cudaField Ax,
+    cudaField Ay,
+    cudaField Az,
+    cudaField_int scheme,
+    cudaJobPackage job)
+{
+    extern __shared__ REAL sort[];
+
+    dim3 coords;
+    REAL stencil[40];          // 5 rows (components) x 8 stencil points
+    REAL para_ch[14];
+    REAL tmp[5];
+    REAL face_r, face_l;
+
+    int ia1 = -4;
+    int ib1 = 3;
+    int flag;
+
+    // Gather the window of every component; `flag` keeps the last return value.
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = get_data_kernel(flagxyzb.x, &coords, f, eq, row, ia1, ib1, sort, job);
+
+        tmp[eq] = OCFD_bound_character_kernel_m(flagxyzb, coords, row - ia1, job);
+    }
+
+    if(flag == 0) return;
+
+    const int Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);
+    const int in_char_space = (Hyscheme_flag != 1);
+
+    if(in_char_space){
+        get_para_charteric_m_kernel(flagxyzb.x, coords, u, v, w, cc, Ax, Ay, Az, para_ch, job);
+
+        // Per stencil point: transpose out, transform, transpose back.
+        for(int pt = 0; pt < 8; pt++){
+            REAL column[5];
+
+            for(int eq = 0; eq < 5; eq++) column[eq] = stencil[8*eq + pt];
+
+            flux_charteric_ptoc_kernel(column, para_ch);
+
+            for(int eq = 0; eq < 5; eq++) stencil[8*eq + pt] = column[eq];
+        }
+    }
+
+    // Reconstruct each field; results are packed into stencil[0..4].
+    for(int eq = 0; eq < 5; eq++){
+        REAL *row = stencil + 8*eq;
+
+        flag = OCFD_bound_scheme_kernel_m(&face_r, flagxyzb, coords, row, ia1, ib1, job);
+
+        if(flag != 0){
+            switch(Hyscheme_flag){
+                case 1:
+                    face_r = OCFD_UP7_kernel_M(row);
+                    break;
+                case 2:
+                    face_r = OCFD_weno7_kernel_M(row + 1);
+                    break;
+                default:
+                    face_r = OCFD_weno5_kernel_M(row + 2);
+                    break;
+            }
+        }
+
+        stencil[eq] = face_r;
+    }
+
+    if(in_char_space) flux_charteric_ctop_kernel(stencil, para_ch);
+
+    for(int eq = 0; eq < 5; eq++){
+        face_r = stencil[eq];
+        face_l = __shfl_up_double(face_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_character_m_kernel(flagxyzb, coords, tmp[eq], face_r, face_l, du, eq, Ajac, job);
+    }
+}
--- a/src/OCFD_init.cu
+++ b/src/OCFD_init.cu
+#include "math.h"
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_IO.h"
+#include "io_warp.h"
+#include "OCFD_Comput_Jacobian3d.h"
+#include "OCFD_boundary_init.h"
+#include "OCFD_mpi.h"
+#include "OCFD_mpi_dev.h"
+
+#include "OCFD_Stream.h"
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "commen_kernel.h"
+#include "OCFD_init.h"
+#include "time.h"
+#include "mpi.h"
+#include "OCFD_ana.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+/* Zero-fills `count` consecutive REAL values of a host array. */
+static void init_zero_host_array(REAL *dst, size_t count)
+{
+    for(size_t k = 0; k < count; k++) dst[k] = 0.0;
+}
+
+/*
+ * init: solver start-up.
+ *
+ *  - sets the grid spacings hx/hy/hz from the global grid sizes;
+ *  - runs the host/device parameter setup, bc_parameter() and Init_Jacobian3d();
+ *  - zero-fills the device fields and the matching host arrays;
+ *  - Init_stat == 0: fills d, u and T on the device with 1.0 and resets
+ *    Istep / tt; otherwise reads the fields with read_file() and uploads them;
+ *  - converts primitive to conservative variables on the device;
+ *  - runs the boundary-specific initialisation selected by IBC_USER (124, 108).
+ *
+ * Host element counts are computed in size_t: the previous `int` arithmetic
+ * (in particular `tmp_size*5`) can overflow for large local grids.
+ */
+void init()
+{
+
+	hx = 1.0 / (NX_GLOBAL - 1.0);
+	hy = 1.0 / (NY_GLOBAL - 1.0);
+	hz = 1.0 / (NZ_GLOBAL - 1.0);
+
+    opencfd_para_init();
+    opencfd_para_init_dev();
+    
+    bc_parameter();
+    Init_Jacobian3d();
+
+	//----------------------------------------------------------------------------
+    // Zero every device field and its host mirror.
+    {   
+        cuda_mem_value_init_warp(0.0 , pd_d->ptr , pd_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pu_d->ptr , pu_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pv_d->ptr , pv_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pw_d->ptr , pw_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pT_d->ptr , pT_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pP_d->ptr , pP_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        //cuda_mem_value_init_warp(0.0 , pf_lap_d->ptr , pf_lap_d->pitch , nx_2lap , ny_2lap , nz_2lap*5);
+        cuda_mem_value_init_warp(0.0 , pcc_d->ptr, pcc_d->pitch, nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pf_d->ptr , pf_d->pitch , nx      , ny      , nz*5   );
+        cuda_mem_value_init_warp(0.0 , pfn_d->ptr, pfn_d->pitch, nx      , ny      , nz*5   );
+        cuda_mem_value_init_warp(0.0 , pdu_d->ptr, pdu_d->pitch, nx      , ny      , nz*5   );
+        cuda_mem_value_init_warp(0.0 , pEv1_d->ptr , pEv1_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pEv2_d->ptr , pEv2_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pEv3_d->ptr , pEv3_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(0.0 , pEv4_d->ptr , pEv4_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+
+        // Number of REAL values in one halo-padded field / in one 5-component interior array.
+        const size_t halo_cells  = (size_t)(nx + 2*LAP) * (ny + 2*LAP) * (nz + 2*LAP);
+        const size_t inner_vals5 = (size_t)nx * ny * nz * 5;
+
+        init_zero_host_array(pd, halo_cells);
+        init_zero_host_array(pu, halo_cells);
+        init_zero_host_array(pv, halo_cells);
+        init_zero_host_array(pw, halo_cells);
+        init_zero_host_array(pT, halo_cells);
+        init_zero_host_array(pP, halo_cells);
+        init_zero_host_array(pf_lap, halo_cells*5);
+        init_zero_host_array(pcc, halo_cells);
+        init_zero_host_array(pEv1, halo_cells);
+        init_zero_host_array(pEv2, halo_cells);
+        init_zero_host_array(pEv3, halo_cells);
+        init_zero_host_array(pEv4, halo_cells);
+
+        init_zero_host_array(pf , inner_vals5);
+        init_zero_host_array(pfn, inner_vals5);
+        init_zero_host_array(pdu, inner_vals5);
+    }
+
+	if (Init_stat == 0)
+	{
+        // Fresh start: d, u and T are set to 1.0 on the device (v and w stay 0).
+        cuda_mem_value_init_warp(1.0 , pd_d->ptr , pd_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(1.0 , pu_d->ptr , pu_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        cuda_mem_value_init_warp(1.0 , pT_d->ptr , pT_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+        
+		Istep = 0;
+		tt = 0.0;
+	}
+	else
+	{
+        // Restart: read the primitive fields on the host and upload them.
+		read_file(0, pd, pu, pv, pw, pT);
+
+        memcpy_inner(pd , pd_d->ptr , pd_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_inner(pu , pu_d->ptr , pu_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_inner(pv , pv_d->ptr , pv_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_inner(pw , pw_d->ptr , pw_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_inner(pT , pT_d->ptr , pT_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+
+	}
+
+
+	//----------------------------------------------------------------------------
+    // pri to cons
+    {   
+        #ifdef __cplusplus
+        cudaJobPackage job(dim3(0,0,0) , dim3(nx,ny,nz));
+        #else
+        cudaJobPackage job = {0,0,0,nx,ny,nz};
+        #endif
+
+        dim3 blockdim = {BlockDimX , BlockDimY , BlockDimZ};
+        pri_to_cons_kernel_warp(pf_d , pd_d , pu_d , pv_d , pw_d , pT_d ,job , blockdim);
+
+    }
+
+    //Init_Jacobian3d();
+    
+    // Boundary-condition specific initialisation.
+    switch(IBC_USER){
+        case 124:
+        {
+            if(Init_stat == 1) bc_user_Liftbody3d_init();
+            boundary_Jac3d_Liftbody_Ajac();
+        }
+        break;
+
+        case 108:
+        bc_user_Compression_conner_init();
+
+        break;
+    }
+
+}
+// ======================================================================= //
+
+/*
+ * opencfd_mem_init_all: runs the memory set-up routines in a fixed order:
+ * host arrays, device fields, stream resources, device-side MPI buffers.
+ * NOTE(review): opencfd_mem_init_boundary() is not called from here although
+ * opencfd_mem_finalize_all() calls opencfd_mem_finalize_boundary();
+ * presumably the boundary buffers are allocated elsewhere -- confirm.
+ */
+void opencfd_mem_init_all(){
+    opencfd_mem_init();
+    opencfd_mem_init_dev();
+//    opencfd_mem_init_dev2();
+    opencfd_mem_init_Stream();
+    opencfd_mem_init_mpi_dev();
+}
+
+/*
+ * opencfd_mem_finalize_all: runs every tear-down routine in a fixed order:
+ * host arrays, MPI, streams, device-side MPI buffers, device fields and
+ * finally the boundary-condition buffers.
+ */
+void opencfd_mem_finalize_all(){
+    opencfd_mem_finalize();
+    opencfd_mem_finalize_mpi();
+    opencfd_mem_finalize_Stream();
+    opencfd_mem_finalize_mpi_dev();
+    opencfd_mem_finalize_dev();
+    opencfd_mem_finalize_boundary();
+}
+
+
+/*
+ * opencfd_mem_init: allocates all host-side arrays.
+ *
+ *  - halo-padded arrays, (nx+2*LAP)*(ny+2*LAP)*(nz+2*LAP) REALs each:
+ *    Jacobian/metric terms, d/u/v/w/T/P, cc, Ev1..Ev4, plus the 5-component
+ *    f_lap / fp / fm;
+ *  - interior arrays, nx*ny*nz REALs each (x5 for f, fn, du, dfp, dfm);
+ *  - host pack/unpack buffers for the three directions (LAP layers each),
+ *    obtained through malloc_me_Host;
+ *  - scheme_x/y/z index arrays when IFLAG_HybridAuto == 1.
+ *
+ * Byte counts are kept in size_t. They were previously stored in an `int`, so
+ * `tmp_size*5` (and, for bigger grids, `tmp_size` itself) could overflow.
+ * NOTE(review): this only helps end-to-end if malloc_me / malloc_me_Host take
+ * a size_t-wide size -- confirm their declarations.
+ */
+void opencfd_mem_init()
+{
+    const size_t halo_bytes  = (size_t)(nx + 2 * LAP) * (ny + 2 * LAP) * (nz + 2 * LAP) * sizeof(REAL);
+    const size_t inner_bytes = (size_t)nx * ny * nz * sizeof(REAL);
+    size_t pack_bytes;
+
+    // space transformation jacobian data
+    pAxx = (REAL *)malloc_me(halo_bytes);
+    pAyy = (REAL *)malloc_me(halo_bytes);
+    pAzz = (REAL *)malloc_me(halo_bytes);
+    pAkx = (REAL *)malloc_me(halo_bytes);
+    pAky = (REAL *)malloc_me(halo_bytes);
+    pAkz = (REAL *)malloc_me(halo_bytes);
+    pAix = (REAL *)malloc_me(halo_bytes);
+    pAiy = (REAL *)malloc_me(halo_bytes);
+    pAiz = (REAL *)malloc_me(halo_bytes);
+    pAsx = (REAL *)malloc_me(halo_bytes);
+    pAsy = (REAL *)malloc_me(halo_bytes);
+    pAsz = (REAL *)malloc_me(halo_bytes);
+    pAjac = (REAL *)malloc_me(halo_bytes);
+
+    // computing data without LAP
+    pf  = (REAL *)malloc_me(inner_bytes*5);
+    pfn = (REAL *)malloc_me(inner_bytes*5);
+    pdu = (REAL *)malloc_me(inner_bytes*5);
+
+    pAmu = (REAL *)malloc_me(inner_bytes);
+
+    pdfp = (REAL*)malloc_me(inner_bytes*5);
+    pdfm = (REAL*)malloc_me(inner_bytes*5);
+
+    pui = (REAL*)malloc_me(inner_bytes);
+    pus = (REAL*)malloc_me(inner_bytes);
+    puk = (REAL*)malloc_me(inner_bytes);
+    pvk = (REAL*)malloc_me(inner_bytes);
+    pvi = (REAL*)malloc_me(inner_bytes);
+    pvs = (REAL*)malloc_me(inner_bytes);
+    pwk = (REAL*)malloc_me(inner_bytes);
+    pwi = (REAL*)malloc_me(inner_bytes);
+    pws = (REAL*)malloc_me(inner_bytes);
+    pTk = (REAL*)malloc_me(inner_bytes);
+    pTi = (REAL*)malloc_me(inner_bytes);
+    pTs = (REAL*)malloc_me(inner_bytes);
+
+    pQ = (REAL*)malloc_me(inner_bytes);
+    pLamda2 = (REAL*)malloc_me(inner_bytes);
+
+    // computing data with LAP
+    pd = (REAL *)malloc_me(halo_bytes);
+    pu = (REAL *)malloc_me(halo_bytes);
+    pv = (REAL *)malloc_me(halo_bytes);
+    pw = (REAL *)malloc_me(halo_bytes);
+    pT = (REAL *)malloc_me(halo_bytes);
+    pP = (REAL *)malloc_me(halo_bytes);
+
+    pf_lap = (REAL*)malloc_me(halo_bytes*5);
+    pfp = (REAL*)malloc_me(halo_bytes*5);
+    pfm = (REAL*)malloc_me(halo_bytes*5);
+    pcc = (REAL*)malloc_me(halo_bytes);
+
+    pEv1 = (REAL*)malloc_me(halo_bytes);
+    pEv2 = (REAL*)malloc_me(halo_bytes);
+    pEv3 = (REAL*)malloc_me(halo_bytes);
+    pEv4 = (REAL*)malloc_me(halo_bytes);
+
+    // exchange buffers: LAP layers of the face normal to each direction
+    pack_bytes = (size_t)LAP * ny * nz * sizeof(REAL);
+    malloc_me_Host(pack_send_x, pack_bytes);
+    malloc_me_Host(pack_recv_x, pack_bytes);
+
+    pack_bytes = (size_t)LAP * nx * nz * sizeof(REAL);
+    malloc_me_Host(pack_send_y, pack_bytes);
+    malloc_me_Host(pack_recv_y, pack_bytes);
+
+    pack_bytes = (size_t)LAP * nx * ny * sizeof(REAL);
+    malloc_me_Host(pack_send_z, pack_bytes);
+    malloc_me_Host(pack_recv_z, pack_bytes);
+
+    // per-direction scheme index arrays (one extra entry along the sweep direction)
+    if(IFLAG_HybridAuto == 1){
+        scheme_x = (int *)malloc_me((size_t)(nx + 1)*ny*nz*sizeof(int));
+        scheme_y = (int *)malloc_me((size_t)(ny + 1)*nx*nz*sizeof(int));
+        scheme_z = (int *)malloc_me((size_t)(nz + 1)*nx*ny*sizeof(int));
+    }
+}
+
+/*
+ * opencfd_mem_finalize: releases the host arrays allocated by
+ * opencfd_mem_init() -- plain arrays with free(), the pack/unpack buffers with
+ * cudaFreeHost().
+ *
+ * NOTE(review): scheme_x / scheme_y / scheme_z (allocated in opencfd_mem_init
+ * when IFLAG_HybridAuto == 1) are not released here -- confirm whether they
+ * are freed elsewhere.
+ */
+void opencfd_mem_finalize()
+{
+    // Released in this exact order.
+    REAL *host_arrays[] = {
+        // Jacobian / metric terms
+        pAxx, pAyy, pAzz,
+        pAkx, pAky, pAkz,
+        pAix, pAiy, pAiz,
+        pAsx, pAsy, pAsz,
+        pAjac,
+        // interior 5-component arrays
+        pf, pfn, pdu,
+        pAmu,
+        pdfp, pdfm,
+        // derivative work arrays
+        pui, pus, puk,
+        pvk, pvi, pvs,
+        pwk, pwi, pws,
+        pTk, pTi, pTs,
+        // halo-padded primitive fields
+        pd, pu, pv, pw, pT, pP,
+        pf_lap, pfp, pfm, pcc,
+        pEv1, pEv2, pEv3, pEv4,
+        pQ, pLamda2
+    };
+    const unsigned int n_arrays = sizeof(host_arrays) / sizeof(host_arrays[0]);
+
+    for(unsigned int k = 0; k < n_arrays; k++){
+        free(host_arrays[k]);
+    }
+
+    // x / y / z exchange buffers
+    cudaFreeHost(pack_send_x);
+    cudaFreeHost(pack_recv_x);
+    cudaFreeHost(pack_send_y);
+    cudaFreeHost(pack_recv_y);
+    cudaFreeHost(pack_send_z);
+    cudaFreeHost(pack_recv_z);
+}
+
+/*
+ * opencfd_mem_init_boundary: allocates the host arrays and device fields
+ * needed by the user boundary condition selected by IBC_USER.
+ *   124: 2-D inlet/upper profiles, wall-disturbance arrays, pfx / pgz,
+ *        TM / fait tables (MTMAX entries) and the matching device fields;
+ *   108: pub1 (4*ny values), pfx (nx), pgz (nz), TM / fait tables and
+ *        their device fields.
+ * Any other value allocates nothing.
+ */
+void opencfd_mem_init_boundary(){
+    if(IBC_USER == 124){
+        // host side
+        pu2d_inlet    = (REAL*)malloc_me(sizeof(REAL)*5*nz*ny);
+        pu2d_upper    = (REAL*)malloc_me(sizeof(REAL)*5*ny*nx);
+        pv_dist_wall  = (REAL*)malloc_me(sizeof(REAL)*ny*nx);
+        pv_dist_coeff = (REAL*)malloc_me(sizeof(REAL)*3*ny*nx);
+        pu_dist_upper = (REAL*)malloc_me(sizeof(REAL)*ny*nx);
+        pfx  = (REAL*)malloc_me(sizeof(REAL)*nx);
+        pgz  = (REAL*)malloc_me(sizeof(REAL)*ny);
+        TM   = (REAL*)malloc_me(sizeof(REAL)*MTMAX);
+        fait = (REAL*)malloc_me(sizeof(REAL)*MTMAX);
+
+        // device side (no device copy of pv_dist_wall is created)
+        new_cudaField(&pu2d_inlet_d, ny, nz, 5);
+        new_cudaField(&pu2d_upper_d, nx, ny, 5);
+        new_cudaField(&pv_dist_coeff_d, nx, ny, 3);
+        new_cudaField(&pu_dist_upper_d, nx, ny, 1);
+    }
+    else if(IBC_USER == 108){
+        // host side
+        pub1 = (REAL*)malloc_me(sizeof(REAL)*4*ny);
+        pfx  = (REAL*)malloc_me(sizeof(REAL)*nx);
+        pgz  = (REAL*)malloc_me(sizeof(REAL)*nz);
+
+        TM   = (REAL*)malloc_me(sizeof(REAL)*MTMAX);
+        fait = (REAL*)malloc_me(sizeof(REAL)*MTMAX);
+
+        // device side
+        new_cudaField(&pub1_d, ny, 4, 1);
+        new_cudaField(&pfx_d, nx, 1, 1);
+        new_cudaField(&pgz_d, nz, 1, 1);
+    }
+}
+
+/*
+ * opencfd_mem_finalize_boundary: releases the boundary-condition buffers of
+ * the active IBC_USER case (124 or 108); other values release nothing.
+ *
+ * NOTE(review): for case 124, pu_dist_upper_d is created in
+ * opencfd_mem_init_boundary() but is not deleted here -- confirm whether that
+ * is intentional.
+ */
+void opencfd_mem_finalize_boundary(){
+    if(IBC_USER == 124){
+        // host arrays
+        free(pu2d_inlet);
+        free(pu2d_upper);
+        free(pv_dist_wall);
+        free(pv_dist_coeff);
+        free(pu_dist_upper);
+        free(pfx);
+        free(pgz);
+        free(TM);
+        free(fait);
+
+        // device fields
+        delete_cudaField(pu2d_inlet_d);
+        delete_cudaField(pu2d_upper_d);
+        delete_cudaField(pv_dist_coeff_d);
+    }
+    else if(IBC_USER == 108){
+        // host arrays
+        free(pub1);
+        free(pfx);
+        free(pgz);
+        free(TM);
+        free(fait);
+
+        // device fields
+        delete_cudaField(pub1_d);
+        delete_cudaField(pfx_d);
+        delete_cudaField(pgz_d);
+    }
+}
+
+/*
+ * opencfd_para_init: derives host-side constants from the input parameters.
+ *
+ *  - Cv, Cp from Gamma and Ama; Tsb = 110.4 / Ref_T and amu_C0 from Re and
+ *    Ref_T (the 110.4 constant is presumably the Sutherland temperature --
+ *    confirm);
+ *  - tmp0, split_C1, split_C3 from Gamma;
+ *  - padded grid extents n?_lap = n? + LAP and n?_2lap = n? + 2*LAP;
+ *  - vis_flux_init_c = Cp / Pr and end_step = ceil(end_time / dt).
+ *
+ * Finally synchronises the ranks and lets rank 0 print a completion line.
+ * The barrier is issued only when MPI has been initialised: the previous test
+ * was inverted and called MPI_Barrier exactly when MPI was NOT initialised.
+ * NOTE(review): this assumes every rank of MPI_COMM_WORLD calls this function.
+ */
+void opencfd_para_init(){
+    Cv = 1.0 / (Gamma * (Gamma - 1.0) * Ama * Ama);
+    Cp = 1.0 / ((Gamma - 1.0) * Ama * Ama);
+    Tsb = 110.4 / Ref_T;
+
+    amu_C0 = 1.0 / Re * (1.0 + 110.4 / Ref_T);
+    tmp0 = 1.0 / (2.0 * Gamma);
+    split_C1 = 2.0 * (Gamma - 1.0); 
+    split_C3 = (3.0 - Gamma) / (2.0 * (Gamma - 1.0));
+
+    nx_lap = nx+LAP;
+    ny_lap = ny+LAP;
+    nz_lap = nz+LAP;
+
+    nx_2lap = nx + 2*LAP;
+    ny_2lap = ny + 2*LAP;
+    nz_2lap = nz + 2*LAP;
+    
+    vis_flux_init_c = Cp/Pr;
+
+    end_step = (int)ceil(end_time/dt);
+
+    // Synchronise before reporting, but only if MPI is actually up.
+    int mpi_ready = 0;
+	MPI_Initialized(&mpi_ready);
+	if(mpi_ready){
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    if(my_id == 0) printf("%s finished\n",__func__);
+}
+
+// ======================================================================================== //
+
+/*
+ * opencfd_mem_init_dev: creates every device-side field.
+ *
+ * "Halo" fields are (nx+2*LAP) x (ny+2*LAP) x (nz+2*LAP); "interior" fields
+ * are nx x ny x nz. Objects are created in the same order as before; the
+ * groups below only factor out the repeated calls.
+ *
+ * pf_lap_d is not a separate allocation: it is a malloc'ed cudaSoA wrapper
+ * whose ptr / pitch alias pfp_x_d.
+ */
+void opencfd_mem_init_dev(){
+    unsigned int k;
+
+    // Jacobian / metric terms (halo)
+    cudaField **metric_fields[] = {
+        &pAxx_d, &pAyy_d, &pAzz_d,
+        &pAkx_d, &pAky_d, &pAkz_d,
+        &pAix_d, &pAiy_d, &pAiz_d,
+        &pAsx_d, &pAsy_d, &pAsz_d,
+        &pAjac_d
+    };
+    for(k = 0; k < sizeof(metric_fields)/sizeof(metric_fields[0]); k++){
+        new_cudaField(metric_fields[k], nx+2*LAP, ny+2*LAP, nz+2*LAP);
+    }
+
+    // viscosity (interior)
+    new_cudaField(&pAmu_d, nx, ny, nz);
+
+    // primitive variables (halo)
+    cudaField **primitive_fields[] = { &pd_d, &pu_d, &pv_d, &pw_d, &pT_d, &pP_d };
+    for(k = 0; k < sizeof(primitive_fields)/sizeof(primitive_fields[0]); k++){
+        new_cudaField(primitive_fields[k], nx+2*LAP, ny+2*LAP, nz+2*LAP);
+    }
+
+    // conservative variables and increments (interior SoA)
+    cudaSoA **conservative_soa[] = { &pf_d, &pfn_d, &pdu_d };
+    for(k = 0; k < sizeof(conservative_soa)/sizeof(conservative_soa[0]); k++){
+        new_cudaSoA(conservative_soa[k], nx, ny, nz);
+    }
+
+    // sound speed (halo)
+    new_cudaField(&pcc_d, nx+2*LAP, ny+2*LAP, nz+2*LAP);
+
+    // split fluxes, x direction (halo SoA)
+    new_cudaSoA(&pfp_x_d, nx+2*LAP, ny+2*LAP, nz+2*LAP);
+    new_cudaSoA(&pfm_x_d, nx+2*LAP, ny+2*LAP, nz+2*LAP);
+
+    // pf_lap_d shares the storage of pfp_x_d
+    pf_lap_d = (cudaSoA *)malloc(sizeof(cudaSoA));
+    pf_lap_d->ptr = pfp_x_d->ptr;
+    pf_lap_d->pitch = pfp_x_d->pitch;
+
+    // split fluxes, y and z directions (halo SoA)
+    cudaSoA **split_flux_yz[] = { &pfp_y_d, &pfm_y_d, &pfp_z_d, &pfm_z_d };
+    for(k = 0; k < sizeof(split_flux_yz)/sizeof(split_flux_yz[0]); k++){
+        new_cudaSoA(split_flux_yz[k], nx+2*LAP, ny+2*LAP, nz+2*LAP);
+    }
+
+    // viscous terms (interior)
+    cudaField **viscous_fields[] = { &vis_u_d, &vis_v_d, &vis_w_d, &vis_T_d };
+    for(k = 0; k < sizeof(viscous_fields)/sizeof(viscous_fields[0]); k++){
+        new_cudaField(viscous_fields[k], nx, ny, nz);
+    }
+
+    // Ev1..Ev4 (halo)
+    cudaField **ev_fields[] = { &pEv1_d, &pEv2_d, &pEv3_d, &pEv4_d };
+    for(k = 0; k < sizeof(ev_fields)/sizeof(ev_fields[0]); k++){
+        new_cudaField(ev_fields[k], nx+2*LAP, ny+2*LAP, nz+2*LAP);
+    }
+
+    // derivative work fields of u, v, w, T (interior)
+    cudaField **derivative_fields[] = {
+        &puk_d, &pui_d, &pus_d,
+        &pvk_d, &pvi_d, &pvs_d,
+        &pwk_d, &pwi_d, &pws_d,
+        &pTk_d, &pTi_d, &pTs_d
+    };
+    for(k = 0; k < sizeof(derivative_fields)/sizeof(derivative_fields[0]); k++){
+        new_cudaField(derivative_fields[k], nx, ny, nz);
+    }
+
+    // hybrid-scheme selectors (one extra entry along the sweep direction) and pPP
+    if(IFLAG_HybridAuto == 1){
+        new_cudaField_int(&(HybridAuto.scheme_x), nx+1, ny, nz);
+        new_cudaField_int(&(HybridAuto.scheme_y), nx, ny+1, nz);
+        new_cudaField_int(&(HybridAuto.scheme_z), nx, ny, nz+1);
+        new_cudaField(&pPP_d, nx+2*LAP, ny+2*LAP, nz+2*LAP);
+    }
+}
+
+
+/*
+ * opencfd_mem_finalize_dev: releases the device fields created by
+ * opencfd_mem_init_dev().
+ *
+ * pf_lap_d is only a malloc'ed wrapper whose ptr / pitch alias pfp_x_d, so it
+ * must not go through delete_cudaSoA (that would release the shared storage
+ * twice). The wrapper struct itself is now freed here; it used to be leaked.
+ *
+ * NOTE(review): the objects created under IFLAG_HybridAuto == 1
+ * (HybridAuto.scheme_x/y/z, pPP_d) are not released here -- confirm whether
+ * they are cleaned up elsewhere.
+ */
+void opencfd_mem_finalize_dev(){
+    delete_cudaField(pAxx_d  );
+    delete_cudaField(pAyy_d  );
+    delete_cudaField(pAzz_d  );
+    delete_cudaField(pAkx_d  );
+    delete_cudaField(pAky_d  );
+    delete_cudaField(pAkz_d  );
+    delete_cudaField(pAix_d  );
+    delete_cudaField(pAiy_d  );
+    delete_cudaField(pAiz_d  );
+    delete_cudaField(pAsx_d  );
+    delete_cudaField(pAsy_d  );
+    delete_cudaField(pAsz_d  );
+    delete_cudaField(pAjac_d );
+
+    delete_cudaField(pAmu_d);
+
+    delete_cudaField(pd_d);
+    delete_cudaField(pu_d);
+    delete_cudaField(pv_d);
+    delete_cudaField(pw_d);
+    delete_cudaField(pT_d);
+    delete_cudaField(pP_d);
+
+    delete_cudaSoA(pf_d );
+    delete_cudaSoA(pfn_d);
+    delete_cudaSoA(pdu_d);
+
+    // Alias of pfp_x_d: free only the wrapper, the storage goes with pfp_x_d below.
+    //delete_cudaSoA(pf_lap_d);
+    free(pf_lap_d);
+    pf_lap_d = NULL;
+
+    delete_cudaSoA(pfp_x_d);
+    delete_cudaSoA(pfm_x_d);
+
+    delete_cudaSoA(pfp_y_d);
+    delete_cudaSoA(pfm_y_d);
+
+    delete_cudaSoA(pfp_z_d);
+    delete_cudaSoA(pfm_z_d);
+    
+    delete_cudaField(pcc_d);
+    
+    //delete_cudaField(pdfp_d);
+    //delete_cudaField(pdfm_d);
+
+    delete_cudaField(vis_u_d);
+    delete_cudaField(vis_v_d);
+    delete_cudaField(vis_w_d);
+    delete_cudaField(vis_T_d);
+
+    delete_cudaField(pEv1_d);
+    delete_cudaField(pEv2_d);
+    delete_cudaField(pEv3_d);
+    delete_cudaField(pEv4_d);
+
+    delete_cudaField( puk_d );
+    delete_cudaField( pui_d );
+    delete_cudaField( pus_d );
+
+    delete_cudaField( pvk_d);
+    delete_cudaField( pvi_d);
+    delete_cudaField( pvs_d);
+
+    delete_cudaField( pwk_d );
+    delete_cudaField( pwi_d );
+    delete_cudaField( pws_d );
+
+    delete_cudaField( pTk_d );
+    delete_cudaField( pTi_d );
+    delete_cudaField( pTs_d );
+    
+}
+
+
+// HIP_SYMBOL(var): wraps the symbol argument passed to cudaMemcpyToSymbol.
+// Unless HIP_SYMBOL is already defined, it expands to the address of the
+// symbol when the file is not compiled by nvcc (__NVCC__ undefined) and to
+// the symbol itself under nvcc.
+// NOTE(review): the macro argument is not parenthesised in the (&var) form;
+// only pass plain identifiers.
+#ifndef __NVCC__
+
+#ifndef HIP_SYMBOL
+#define HIP_SYMBOL( var ) (&var)
+#endif
+
+#else
+
+#ifndef HIP_SYMBOL
+#define HIP_SYMBOL( var ) (var)
+#endif
+
+#endif
+
+
+
+/*
+ * Copy the host-side run parameters into the matching device symbols (the *_d names)
+ * with cudaMemcpyToSymbol: gas/scheme constants, the time step, the local grid
+ * extents (plain, +LAP and +2*LAP variants), the grid spacings and the WENO limiter
+ * settings. Every copy transfers exactly one REAL or one unsigned int.
+ *
+ * Afterwards all ranks are synchronised with MPI_Barrier -- but only when MPI has
+ * actually been initialised, because calling MPI_Barrier before MPI_Init is erroneous.
+ * (The original test was inverted and called the barrier only when MPI was NOT running.)
+ * Rank 0 finally reports completion on stdout.
+ */
+void opencfd_para_init_dev(){
+    // ---- physical / numerical constants (one REAL each) ----
+    cudaMemcpyToSymbol( HIP_SYMBOL(Ama_d) , &Ama , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(Gamma_d) , &Gamma , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(epsl_sw_d) , &epsl_SW , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(Cv_d) , &Cv , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(Cp_d) , &Cp , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(Tsb_d) , &Tsb , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(amu_C0_d) , &amu_C0 , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(dt_d) , &dt , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(split_C1_d) , &split_C1 , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(split_C3_d) , &split_C3 , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+
+    cudaMemcpyToSymbol( HIP_SYMBOL(vis_flux_init_c_d) , &vis_flux_init_c , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+
+    // ---- local grid extents (one unsigned int each) ----
+    cudaMemcpyToSymbol( HIP_SYMBOL(nx_d) , &nx , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(ny_d) , &ny , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(nz_d) , &nz , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(nx_lap_d) , &nx_lap , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(ny_lap_d) , &ny_lap , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(nz_lap_d) , &nz_lap , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(nx_2lap_d) , &nx_2lap , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(ny_2lap_d) , &ny_2lap , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(nz_2lap_d) , &nz_2lap , sizeof(unsigned int) , 0 , cudaMemcpyHostToDevice);
+
+    // ---- grid spacings (one REAL each) ----
+    cudaMemcpyToSymbol( HIP_SYMBOL(hx_d) , &hx , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(hy_d) , &hy , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(hz_d) , &hz , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+
+    // ---- WENO limiter settings (one REAL each) ----
+    cudaMemcpyToSymbol( HIP_SYMBOL(WENO_TV_Limiter_d) , &WENO_TV_Limiter , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+    cudaMemcpyToSymbol( HIP_SYMBOL(WENO_TV_MAX_d) , &WENO_TV_MAX , sizeof(REAL) , 0 , cudaMemcpyHostToDevice);
+
+    // Barrier only if MPI is up; MPI_Barrier on an uninitialised MPI library is an error.
+    int flag = 0;
+    MPI_Initialized(&flag);
+    if(flag){
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    if(my_id == 0) printf("%s finished\n",__func__);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_mpi.c
+++ b/src/OCFD_mpi.c
+/*OpenCFD ver 1.4, CopyRight by Li Xinliang, LNM, Institute of Mechanics, CAS, Beijing, Email: lixl@lnm.imech.ac.cn
+MPI subroutines, such as computational domain partitioning and MPI message send and recv
+只支持N_MSG_SIZE=0, -2  两种通信方式 
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "mpi.h"
+#include "utility.h"
+#include "OCFD_mpi.h"
+#include "parameters.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// Storage allocated in mpi_init and handed to MPI_Buffer_attach there
+static REAL *BUFFER_MPI;// Buffer for MPI  message transfer (used by MPI_Bsend)
+// Initial buffer capacity counted in REAL elements (mpi_init multiplies it by sizeof(REAL))
+static int IBUFFER_SIZE = 100000;
+
+static char mpi_mem_initialized = 0;
+/*
+ * Allocate the host tables that describe the domain partition: for each of the
+ * three directions one int per process along that direction for the block size
+ * (i_nn / j_nn / k_nn) and one for the block offset (i_offset / j_offset / k_offset).
+ * The mpi_mem_initialized flag makes repeated calls no-ops.
+ */
+void opencfd_mem_init_mpi(){
+	if(mpi_mem_initialized != 0){
+		return;
+	}
+	mpi_mem_initialized = 1;
+
+	/* x direction: NPX0 entries */
+	i_nn     = (int *)malloc(NPX0 * sizeof(int));
+	i_offset = (int *)malloc(NPX0 * sizeof(int));
+
+	/* y direction: NPY0 entries */
+	j_nn     = (int *)malloc(NPY0 * sizeof(int));
+	j_offset = (int *)malloc(NPY0 * sizeof(int));
+
+	/* z direction: NPZ0 entries */
+	k_nn     = (int *)malloc(NPZ0 * sizeof(int));
+	k_offset = (int *)malloc(NPZ0 * sizeof(int));
+}
+
+/*
+ * Counterpart of opencfd_mem_init_mpi: frees the six partition tables and the six
+ * sub-communicators (X, Y, Z, XY, XZ, YZ). Does nothing unless the tables are
+ * currently marked as allocated.
+ */
+void opencfd_mem_finalize_mpi(){
+	if(mpi_mem_initialized != 1){
+		return;
+	}
+	mpi_mem_initialized = 0;
+
+	/* partition tables (host memory only) */
+	free(i_nn);
+	free(j_nn);
+	free(k_nn);
+
+	free(i_offset);
+	free(j_offset);
+	free(k_offset);
+
+	/* the derived datatypes are deliberately left alone here: */
+	// MPI_Type_free(&TYPE_LAPX2);
+	// MPI_Type_free(&TYPE_LAPZ2);
+	// MPI_Type_free(&TYPE_LAPY2);
+
+	/* 1-D communicators */
+	MPI_Comm_free(&MPI_COMM_X);
+	MPI_Comm_free(&MPI_COMM_Y);
+	MPI_Comm_free(&MPI_COMM_Z);
+	/* 2-D communicators */
+	MPI_Comm_free(&MPI_COMM_XY);
+	MPI_Comm_free(&MPI_COMM_XZ);
+	MPI_Comm_free(&MPI_COMM_YZ);
+}
+
+/*
+ * Start MPI with full thread support unless it is already running.
+ *
+ * Argc, Argv : forwarded unchanged to MPI_Init_thread.
+ *
+ * On success the rank of this process is stored in my_id, storage for 12 thread
+ * handles is allocated, and a buffer of IBUFFER_SIZE REALs is attached for buffered sends.
+ * If the library cannot provide MPI_THREAD_MULTIPLE the program terminates; it now
+ * finalises MPI and exits with EXIT_FAILURE (it used to exit with status 0, which
+ * made the failure look like success to job scripts). Allocation failures abort the
+ * job instead of passing NULL to MPI_Buffer_attach.
+ * When MPI was initialised before this call, nothing is done.
+ */
+void mpi_init(int *Argc , char *** Argv){
+	int flag, provided;
+	MPI_Initialized(&flag);
+	if(flag){
+		return;
+	}
+
+	MPI_Init_thread(Argc, Argv, MPI_THREAD_MULTIPLE, &provided);
+	if(provided != MPI_THREAD_MULTIPLE){
+		printf("\033[31mMPI do not Support Multiple thread\033[0m\n");
+		MPI_Finalize();
+		exit(EXIT_FAILURE);
+	}
+	MPI_Comm_rank(MPI_COMM_WORLD , &my_id);
+
+	thread_handles = (pthread_t* )malloc(12*sizeof(pthread_t));
+	BUFFER_MPI = (REAL*)malloc(sizeof(REAL)*IBUFFER_SIZE);
+	if(thread_handles == NULL || BUFFER_MPI == NULL){
+		fprintf(stderr, "mpi_init: out of memory while allocating MPI buffers\n");
+		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+	}
+	MPI_Buffer_attach(BUFFER_MPI , IBUFFER_SIZE*sizeof(REAL));
+}
+/*
+ * Shut MPI down: release the partition tables/communicators, detach and free the
+ * buffered-send storage, finalise MPI and free the thread-handle array.
+ *
+ * Nothing happens when MPI was never initialised or has already been finalised
+ * (MPI_Initialized keeps reporting true after MPI_Finalize, so MPI_Finalized is
+ * checked as well; this makes a second call harmless).
+ *
+ * MPI_Buffer_detach expects the ADDRESS of a pointer that receives the detached
+ * buffer plus an int that receives its size in bytes. Scratch variables are used
+ * for both, so neither the buffer contents nor IBUFFER_SIZE (an element count) are
+ * overwritten.
+ */
+void mpi_finalize(){
+	int initialized = 0, finalized = 0;
+	void *detached_buffer = NULL;
+	int detached_bytes = 0;
+
+	MPI_Initialized(&initialized);
+	MPI_Finalized(&finalized);
+	if(!initialized || finalized){
+		return;
+	}
+
+	opencfd_mem_finalize_mpi();
+
+	/* blocks until all buffered messages have been delivered */
+	MPI_Buffer_detach(&detached_buffer , &detached_bytes);
+	free(BUFFER_MPI);
+	BUFFER_MPI = NULL;
+	MPI_Finalize();
+
+	free(thread_handles);
+	thread_handles = NULL;
+}
+
+/*
+ * Domain partition. Checks that the number of MPI processes equals NPX0*NPY0*NPZ0,
+ * derives this rank's coordinates (npx, npy, npz) on the process grid, creates the
+ * 1-D and 2-D sub-communicators, computes the local grid sizes (nx, ny, nz), fills
+ * the global size/offset tables, builds the derived MPI datatypes and determines
+ * the six neighbour ranks (MPI_PROC_NULL at non-periodic outer boundaries).
+ * Takes no arguments; all results are written to global variables.
+ * Finalises MPI and exits with EXIT_FAILURE on a process-count mismatch.
+ */
+void part()
+{	
+	// Domain partition----------------------------------------------------------------------------
+	int k, ka;
+	int npx1, npy1, npz1, npx2, npy2, npz2;
+
+	// ---------------------------------------------------------------------------------------------
+	int np_size;
+	MPI_Comm_size(MPI_COMM_WORLD, &np_size);
+	if (np_size != NPX0 * NPY0 * NPZ0)
+	{
+		if (my_id == 0)
+			printf("The Number of total Processes is not equal to NPX0*NPY0*NPZ0 !\n");
+		MPI_Finalize();
+		exit(EXIT_FAILURE);
+	}
+
+	// Partition by hand: decompose the rank along x, y, z to obtain this process'
+	// coordinates on the Cartesian process grid (x varies fastest, z slowest)
+	npx = my_id % NPX0;  // process index in the x direction
+	npy = my_id % (NPX0 * NPY0) / NPX0;  // process index in the y direction
+	npz = my_id / (NPX0 * NPY0);  // process index in the z direction
+	// ------communicators-----------------------------------------------------------------------------
+
+	// colour = the coordinates shared by all members, key = position inside the new communicator
+	MPI_Comm_split(MPI_COMM_WORLD, npz * NPX0 * NPY0 + npy * NPX0, npx, &MPI_COMM_X); // 1-D
+	MPI_Comm_split(MPI_COMM_WORLD, npz * NPX0 * NPY0 + npx, npy, &MPI_COMM_Y);
+	MPI_Comm_split(MPI_COMM_WORLD, npy * NPX0 + npx, npz, &MPI_COMM_Z);
+	MPI_Comm_split(MPI_COMM_WORLD, npz, npy * NPX0 + npx, &MPI_COMM_XY); // 2-D
+	MPI_Comm_split(MPI_COMM_WORLD, npy, npz * NPX0 + npx, &MPI_COMM_XZ);
+	MPI_Comm_split(MPI_COMM_WORLD, npx, npz * NPY0 + npy, &MPI_COMM_YZ);
+
+	// ------------------------------------------------------------------------------------------------
+	// Distribute the grid points evenly; if NX_GLOBAL is not divisible by NPX0 the
+	// remaining points are handed, one each, to the lowest-numbered processes
+	// ------------------------------------------------------------------------------------------------
+	nx = NX_GLOBAL / NPX0;
+	ny = NY_GLOBAL / NPY0;
+	nz = NZ_GLOBAL / NPZ0;
+	if (npx < NX_GLOBAL % NPX0)
+		nx = nx + 1;
+	if (npy < NY_GLOBAL % NPY0)
+		ny = ny + 1;
+	if (npz < NZ_GLOBAL % NPZ0)
+		nz = nz + 1;
+
+
+	// ------number of x-direction grid points and start position on the processes with npx = k
+	// --------------------------------------------------------------------
+	for (k = 0; k < NPX0; k++)
+	{
+		// ka = how many of the preceding processes received one extra point
+		ka = fmin(k, NX_GLOBAL % NPX0);
+		// i_offset[k]: total number of points owned by processes 0..k-1, i.e. the
+		// global index at which block k starts;
+		// together the offsets describe the global partition
+		i_offset[k] = NX_GLOBAL / NPX0 * k + ka;
+		// i_nn[k]: size of block k
+		i_nn[k] = NX_GLOBAL / NPX0;
+		if (k < NX_GLOBAL % NPX0)
+			i_nn[k] += 1;
+	}
+	// same tables for the y direction
+	for (k = 0; k < NPY0; k++)
+	{
+		ka = fmin(k, NY_GLOBAL % NPY0);
+		j_offset[k] = NY_GLOBAL / NPY0 * k + ka;
+		j_nn[k] = NY_GLOBAL / NPY0;
+		if (k < NY_GLOBAL % NPY0)
+			j_nn[k] += 1;
+	}
+	// same tables for the z direction
+	for (k = 0; k < NPZ0; k++)
+	{
+		ka = fmin(k, NZ_GLOBAL % NPZ0);
+		k_offset[k] = NZ_GLOBAL / NPZ0 * k + ka;
+		k_nn[k] = NZ_GLOBAL / NPZ0;
+		if (k < NZ_GLOBAL % NPZ0)
+			k_nn[k] += 1;
+	}
+	// --------------------------------------------------------------------------------
+	// -------New Data TYPE------------------------------------------------------------
+	New_MPI_datatype();
+
+	// --------define proc id:  the right, left, up, bottom, front and backward  procs
+	// my_mod1 wraps the neighbour coordinate around the process grid
+	npx1 = my_mod1(npx - 1, NPX0);
+	npx2 = my_mod1(npx + 1, NPX0);
+	// neighbours are addressed by their global rank in MPI_COMM_WORLD
+	ID_XM1 = npz * (NPX0 * NPY0) + npy * NPX0 + npx1; // -1 proc in x-direction
+	ID_XP1 = npz * (NPX0 * NPY0) + npy * NPX0 + npx2; // +1 proc in x-direction
+	if (Iperiodic[0] == 0 && npx == 0)
+		ID_XM1 = MPI_PROC_NULL; // if not periodic, 0 node donot send mesg to NPX0-1 node
+	if (Iperiodic[0] == 0 && npx == NPX0 - 1)
+		ID_XP1 = MPI_PROC_NULL;
+
+	npy1 = my_mod1(npy - 1, NPY0);
+	npy2 = my_mod1(npy + 1, NPY0);
+	ID_YM1 = npz * (NPX0 * NPY0) + npy1 * NPX0 + npx;
+	ID_YP1 = npz * (NPX0 * NPY0) + npy2 * NPX0 + npx;
+	if (Iperiodic[1] == 0 && npy == 0)
+		ID_YM1 = MPI_PROC_NULL; // if not periodic, 0 node donot send mesg to NPY0-1 node
+	if (Iperiodic[1] == 0 && npy == NPY0 - 1)
+		ID_YP1 = MPI_PROC_NULL;
+
+	npz1 = my_mod1(npz - 1, NPZ0);
+	npz2 = my_mod1(npz + 1, NPZ0);
+	ID_ZM1 = npz1 * (NPX0 * NPY0) + npy * NPX0 + npx;
+	ID_ZP1 = npz2 * (NPX0 * NPY0) + npy * NPX0 + npx;
+	if (Iperiodic[2] == 0 && npz == 0)
+		ID_ZM1 = MPI_PROC_NULL; // if not periodic, 0 node donot send mesg to NPZ0-1 node
+	if (Iperiodic[2] == 0 && npz == NPZ0 - 1)
+		ID_ZP1 = MPI_PROC_NULL;
+
+	// --------------------------------------------------------------
+	MPI_Barrier(MPI_COMM_WORLD);
+}
+
+// --------------------------------------------------------------------------------
+/*
+ * Wrap index i into the range [0, n-1] by adding or subtracting n at most once.
+ * Values further than n outside the range are therefore NOT fully reduced
+ * (e.g. my_mod1(-5, 4) yields -1).
+ */
+int my_mod1(int i, int n)
+{
+	if (i < 0)
+		return i + n;      /* below the range: shift up by one period */
+
+	return (i > n - 1) ? i - n   /* above the range: shift down by one period */
+	                   : i;      /* already inside */
+}
+// -----------------------------------------------------------------------------------------------
+//  Send Recv non-continuous data using derivative data type
+/*
+ * Build the three derived datatypes used to send/receive non-contiguous boundary
+ * slabs of a padded (nx+2*LAP) x (ny+2*LAP) x (nz+2*LAP) array:
+ *   TYPE_LAPX2 : LAP x ny x nz slab   [0:LAP, LAP:ny+LAP, LAP:nz+LAP]
+ *   TYPE_LAPY2 : nx x LAP x nz slab   [LAP:nx+LAP, 0:LAP, LAP:nz+LAP]
+ *   TYPE_LAPZ2 : nx x ny x LAP slab
+ * Each one is an hvector of per-plane vector types (TYPE_LAP?1), which are freed
+ * again once the 3-D types have been committed. Ends with a barrier on MPI_COMM_WORLD.
+ */
+void New_MPI_datatype()
+{
+	/* per-plane patterns; the row stride is one padded x line, counted in elements */
+	MPI_Type_vector(ny, LAP, nx + 2 * LAP, OCFD_DATA_TYPE, &TYPE_LAPX1); /* [0:LAP,LAP:ny+LAP,k] */
+	MPI_Type_vector(LAP, nx, nx + 2 * LAP, OCFD_DATA_TYPE, &TYPE_LAPY1); /* [LAP:nx+LAP,0:LAP,k] */
+	MPI_Type_vector(ny, nx, nx + 2 * LAP, OCFD_DATA_TYPE, &TYPE_LAPZ1);
+	MPI_Type_commit(&TYPE_LAPX1);
+	MPI_Type_commit(&TYPE_LAPY1);
+	MPI_Type_commit(&TYPE_LAPZ1);
+
+	/* stack the plane patterns; the plane stride is one padded xy plane, counted in bytes */
+	MPI_Type_create_hvector(nz, 1, (nx + 2 * LAP) * (ny + 2 * LAP) * sizeof(REAL), TYPE_LAPX1, &TYPE_LAPX2);
+	MPI_Type_create_hvector(nz, 1, (nx + 2 * LAP) * (ny + 2 * LAP) * sizeof(REAL), TYPE_LAPY1, &TYPE_LAPY2);
+	MPI_Type_create_hvector(LAP, 1, (nx + 2 * LAP) * (ny + 2 * LAP) * sizeof(REAL), TYPE_LAPZ1, &TYPE_LAPZ2);
+	MPI_Type_commit(&TYPE_LAPX2);
+	MPI_Type_commit(&TYPE_LAPY2);
+	MPI_Type_commit(&TYPE_LAPZ2);
+
+	/* the building blocks are no longer needed */
+	MPI_Type_free(&TYPE_LAPX1);
+	MPI_Type_free(&TYPE_LAPY1);
+	MPI_Type_free(&TYPE_LAPZ1);
+
+	MPI_Barrier(MPI_COMM_WORLD);
+}
+// -----------------------------------------------------------------------------------------------
+
+// -------------------------------------------------------------------------------
+// ----Form a Global index, get the node information and local index
+/*
+ * Locate a global x index: *node_i receives the process column whose range
+ * [i_offset[p], i_offset[p+1]) contains i_global (the last column, NPX0-1, when no
+ * such range matches) and *i_local the index relative to that column's offset.
+ */
+void get_i_node(int i_global, int *node_i, int *i_local)
+{
+	int owner = NPX0 - 1;   /* fallback: last process column */
+	int p = 0;
+
+	while (p < NPX0 - 1)
+	{
+		if (i_offset[p] <= i_global && i_global < i_offset[p + 1])
+			owner = p;
+		p++;
+	}
+
+	*node_i = owner;
+	*i_local = i_global - i_offset[*node_i];
+}
+// -------------------------------------------------------------------------------
+/*
+ * Locate a global y index: *node_j receives the process row whose range
+ * [j_offset[p], j_offset[p+1]) contains j_global (the last row, NPY0-1, when no
+ * such range matches) and *j_local the index relative to that row's offset.
+ */
+void get_j_node(int j_global, int *node_j, int *j_local)
+{
+	int owner = NPY0 - 1;   /* fallback: last process row */
+	int p = 0;
+
+	while (p < NPY0 - 1)
+	{
+		if (j_offset[p] <= j_global && j_global < j_offset[p + 1])
+			owner = p;
+		p++;
+	}
+
+	*node_j = owner;
+	*j_local = j_global - j_offset[*node_j];
+}
+// -----------------------------------------------------------------------------------
+/*
+ * Locate a global z index: *node_k receives the process layer whose range
+ * [k_offset[p], k_offset[p+1]) contains k_global (the last layer, NPZ0-1, when no
+ * such range matches) and *k_local the index relative to that layer's offset.
+ */
+void get_k_node(int k_global, int *node_k, int *k_local)
+{
+	int owner = NPZ0 - 1;   /* fallback: last process layer */
+	int p = 0;
+
+	while (p < NPZ0 - 1)
+	{
+		/* lower bound inclusive, upper bound exclusive */
+		if (k_offset[p] <= k_global && k_global < k_offset[p + 1])
+			owner = p;
+		p++;
+	}
+
+	*node_k = owner;
+	*k_local = k_global - k_offset[*node_k];
+}
+
+// !------------------------------------------------------------------------------------
+/*
+ * Rank in MPI_COMM_WORLD of the process with grid coordinates (npx1, npy1, npz1);
+ * x varies fastest, z slowest.
+ */
+int get_id(int npx1, int npy1, int npz1)
+{
+	int rank = npz1 * (NPX0 * NPY0);   /* full xy layers below */
+	rank += npy1 * NPX0;               /* full x rows inside the layer */
+	rank += npx1;                      /* position inside the row */
+	return rank;
+}
+// -------------------------------------------------------------------------------------
+// Message send and recv at inner boundary (or 'MPI boundary')
+/*
+ * Exchange the ghost layers of the host array pf with the neighbouring ranks,
+ * one direction after the other (x, then y, then z), passing the periodicity
+ * flag of the respective direction.
+ */
+void exchange_boundary_xyz(REAL *pf)
+{
+	int periodic;
+
+	periodic = Iperiodic[0];
+	exchange_boundary_x(pf, periodic);
+
+	periodic = Iperiodic[1];
+	exchange_boundary_y(pf, periodic);
+
+	periodic = Iperiodic[2];
+	exchange_boundary_z(pf, periodic);
+}
+// ----------------------------------------------------------------------------------------
+/*
+ * x-direction ghost exchange; picks the transport according to MSG_BLOCK_SIZE:
+ *    0 -> packed copies through temporary buffers (uses Iperiodic1)
+ *   -2 -> derived MPI datatype
+ * Any other value only prints a diagnostic.
+ */
+void exchange_boundary_x(REAL *pf, int Iperiodic1)
+{
+	switch (MSG_BLOCK_SIZE)
+	{
+	case 0:
+		exchange_boundary_x_standard(pf, Iperiodic1);
+		break;
+	case -2:
+		exchange_boundary_x_deftype(pf);
+		break;
+	default:
+		printf("MSG_BLOCK_SIZE error in exchange_boundary_x !");
+		break;
+	}
+}
+// -----------------------------------------------------------------------------------------------
+/*
+ * y-direction ghost exchange; picks the transport according to MSG_BLOCK_SIZE:
+ *    0 -> packed copies through temporary buffers (uses Iperiodic1)
+ *   -2 -> derived MPI datatype
+ * Any other value only prints a diagnostic.
+ */
+void exchange_boundary_y(REAL *pf, int Iperiodic1)
+{
+	switch (MSG_BLOCK_SIZE)
+	{
+	case 0:
+		exchange_boundary_y_standard(pf, Iperiodic1);
+		break;
+	case -2:
+		exchange_boundary_y_deftype(pf);
+		break;
+	default:
+		printf("MSG_BLOCK_SIZE error in exchange_boundary_y !");
+		break;
+	}
+}
+// -----------------------------------------------------------------------------------------------
+/*
+ * z-direction ghost exchange; picks the transport according to MSG_BLOCK_SIZE:
+ *    0 -> packed copies through temporary buffers (uses Iperiodic1)
+ *   -2 -> derived MPI datatype
+ * Any other value only prints a diagnostic.
+ */
+void exchange_boundary_z(REAL *pf, int Iperiodic1)
+{
+	switch (MSG_BLOCK_SIZE)
+	{
+	case 0:
+		exchange_boundary_z_standard(pf, Iperiodic1);
+		break;
+	case -2:
+		exchange_boundary_z_deftype(pf);
+		break;
+	default:
+		printf("MSG_BLOCK_SIZE error in exchange_boundary_z !");
+		break;
+	}
+}
+// =========================================================================================================
+// Boundary message communication (exchange message)
+// =========================================================================================================
+// Standard (most used)
+/*
+ * Exchange the x-direction ghost layers of pf with the left (ID_XM1) and right
+ * (ID_XP1) neighbours through contiguous staging buffers.
+ *
+ * pf         : host array, viewed as f[nz+2*LAP][ny+2*LAP][nx+2*LAP]
+ * Iperiodic1 : 1 if the x direction is periodic; otherwise the outermost ranks
+ *              neither pack nor unpack on their outer side
+ *
+ * Buffers with suffix 1 belong to the left side, suffix 2 to the right side.
+ * The four staging buffers (LAP*ny*nz REALs each) are taken from the heap; as
+ * variable-length arrays they could exhaust the stack for realistic grid sizes.
+ * Aborts the MPI job if the buffers cannot be allocated.
+ */
+void exchange_boundary_x_standard(REAL *pf, int Iperiodic1)
+{
+	MPI_Status status;
+	int i, j, k, k1, nsize = LAP * ny * nz;
+	const int use_left = (npx != 0 || Iperiodic1 == 1);
+	const int use_right = (npx != NPX0 - 1 || Iperiodic1 == 1);
+	REAL *scratch, *tmp_send1, *tmp_send2, *tmp_recv1, *tmp_recv2;
+	REAL(*f)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pf, nx + 2 * LAP, ny + 2 * LAP);
+
+	/* one allocation, split into four equally sized parts */
+	scratch = (REAL *)malloc(sizeof(REAL) * 4 * (size_t)(nsize > 0 ? nsize : 1));
+	if (scratch == NULL)
+	{
+		fprintf(stderr, "exchange_boundary_x_standard: cannot allocate message buffers\n");
+		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+		return;
+	}
+	tmp_send1 = scratch;
+	tmp_send2 = scratch + nsize;
+	tmp_recv1 = scratch + 2 * nsize;
+	tmp_recv2 = scratch + 3 * nsize;
+
+	/* pack the first LAP interior columns (left side) and the last LAP interior columns (right side) */
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < LAP; i++)
+			{
+				k1 = k * ny * LAP + j * LAP + i;
+				if (use_left)
+					tmp_send1[k1] = f[k + LAP][j + LAP][i + LAP];
+				if (use_right)
+					tmp_send2[k1] = f[k + LAP][j + LAP][i + nx];
+			}
+		}
+	}
+
+	/* shift left (tag 9000), then shift right (tag 8000); MPI_PROC_NULL neighbours make a side a no-op */
+	MPI_Sendrecv(tmp_send1, nsize, OCFD_DATA_TYPE, ID_XM1, 9000,
+				 tmp_recv2, nsize, OCFD_DATA_TYPE, ID_XP1, 9000,
+				 MPI_COMM_WORLD, &status);
+	MPI_Sendrecv(tmp_send2, nsize, OCFD_DATA_TYPE, ID_XP1, 8000,
+				 tmp_recv1, nsize, OCFD_DATA_TYPE, ID_XM1, 8000,
+				 MPI_COMM_WORLD, &status);
+
+	/* unpack into the ghost columns; if not periodic, node npx=0 does not need f(i-LAP,j,k) */
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < LAP; i++)
+			{
+				k1 = k * ny * LAP + j * LAP + i;
+				if (use_left)
+					f[k + LAP][j + LAP][i] = tmp_recv1[k1];
+				if (use_right)
+					f[k + LAP][j + LAP][i + nx + LAP] = tmp_recv2[k1];
+			}
+		}
+	}
+
+	free(scratch);
+}
+// ------------------------------------------------------
+/*
+ * Exchange the y-direction ghost layers of pf with the lower (ID_YM1) and upper
+ * (ID_YP1) neighbours through contiguous staging buffers.
+ *
+ * pf         : host array, viewed as f[nz+2*LAP][ny+2*LAP][nx+2*LAP]
+ * Iperiodic1 : 1 if the y direction is periodic; otherwise the outermost ranks
+ *              neither pack nor unpack on their outer side
+ *
+ * Buffers with suffix 1 belong to the lower side, suffix 2 to the upper side.
+ * The four staging buffers (LAP*nz*nx REALs each) are taken from the heap; as
+ * variable-length arrays they could exhaust the stack for realistic grid sizes.
+ * Aborts the MPI job if the buffers cannot be allocated.
+ */
+void exchange_boundary_y_standard(REAL *pf, int Iperiodic1)
+{
+	MPI_Status status;
+	REAL(*f)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pf, nx + 2 * LAP, ny + 2 * LAP);
+	int i, j, k, k1, nsize = LAP * nz* nx;
+	const int use_lower = (npy != 0 || Iperiodic1 == 1);
+	const int use_upper = (npy != NPY0 - 1 || Iperiodic1 == 1);
+	REAL *scratch, *tmp_send1, *tmp_send2, *tmp_recv1, *tmp_recv2;
+
+	/* one allocation, split into four equally sized parts */
+	scratch = (REAL *)malloc(sizeof(REAL) * 4 * (size_t)(nsize > 0 ? nsize : 1));
+	if (scratch == NULL)
+	{
+		fprintf(stderr, "exchange_boundary_y_standard: cannot allocate message buffers\n");
+		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+		return;
+	}
+	tmp_send1 = scratch;
+	tmp_send2 = scratch + nsize;
+	tmp_recv1 = scratch + 2 * nsize;
+	tmp_recv2 = scratch + 3 * nsize;
+
+	/* pack the first LAP interior rows (lower side) and the last LAP interior rows (upper side) */
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < LAP; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+				k1 = k * LAP * nx + j * nx + i;
+				if (use_lower)
+					tmp_send1[k1] = f[k + LAP][j + LAP][i + LAP];
+				if (use_upper)
+					tmp_send2[k1] = f[k + LAP][j + ny][i + LAP];
+			}
+		}
+	}
+
+	/* shift down (tag 9000), then shift up (tag 8000); MPI_PROC_NULL neighbours make a side a no-op */
+	MPI_Sendrecv(tmp_send1, nsize, OCFD_DATA_TYPE, ID_YM1, 9000,
+				 tmp_recv2, nsize, OCFD_DATA_TYPE, ID_YP1, 9000,
+				 MPI_COMM_WORLD, &status);
+	MPI_Sendrecv(tmp_send2, nsize, OCFD_DATA_TYPE, ID_YP1, 8000,
+				 tmp_recv1, nsize, OCFD_DATA_TYPE, ID_YM1, 8000,
+				 MPI_COMM_WORLD, &status);
+
+	/* unpack into the ghost rows */
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < LAP; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+				k1 = k * LAP * nx + j * nx + i;
+				if (use_lower)
+					f[k + LAP][j][i + LAP] = tmp_recv1[k1];
+				if (use_upper)
+					f[k + LAP][j + ny + LAP][i + LAP] = tmp_recv2[k1];
+			}
+		}
+	}
+
+	free(scratch);
+}
+// ------------------------------------------------------------
+/*
+ * Exchange the z-direction ghost layers of pf with the ID_ZM1 and ID_ZP1
+ * neighbours through contiguous staging buffers.
+ *
+ * pf         : host array, viewed as f[nz+2*LAP][ny+2*LAP][nx+2*LAP]
+ * Iperiodic1 : 1 if the z direction is periodic; otherwise the outermost ranks
+ *              neither pack nor unpack on their outer side
+ *
+ * Buffers with suffix 1 belong to the ZM1 side, suffix 2 to the ZP1 side.
+ * The four staging buffers (LAP*nx*ny REALs each) are taken from the heap; as
+ * variable-length arrays they could exhaust the stack for realistic grid sizes.
+ * Aborts the MPI job if the buffers cannot be allocated.
+ */
+void exchange_boundary_z_standard(REAL *pf, int Iperiodic1)
+{
+	MPI_Status status;
+	REAL(*f)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pf, nx + 2 * LAP, ny + 2 * LAP);
+	int i, j, k, k1, nsize = LAP * nx * ny;
+	const int use_zm = (npz != 0 || Iperiodic1 == 1);
+	const int use_zp = (npz != NPZ0 - 1 || Iperiodic1 == 1);
+	REAL *scratch, *tmp_send1, *tmp_send2, *tmp_recv1, *tmp_recv2;
+
+	/* one allocation, split into four equally sized parts */
+	scratch = (REAL *)malloc(sizeof(REAL) * 4 * (size_t)(nsize > 0 ? nsize : 1));
+	if (scratch == NULL)
+	{
+		fprintf(stderr, "exchange_boundary_z_standard: cannot allocate message buffers\n");
+		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+		return;
+	}
+	tmp_send1 = scratch;
+	tmp_send2 = scratch + nsize;
+	tmp_recv1 = scratch + 2 * nsize;
+	tmp_recv2 = scratch + 3 * nsize;
+
+	/* pack the first LAP interior planes (ZM1 side) and the last LAP interior planes (ZP1 side) */
+	for (k = 0; k < LAP; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+				k1 = k * nx * ny + j * nx + i;
+				if (use_zm)
+					tmp_send1[k1] = f[k + LAP][j + LAP][i + LAP];
+				if (use_zp)
+					tmp_send2[k1] = f[k + nz][j + LAP][i + LAP];
+			}
+		}
+	}
+
+	/* shift towards ZM1 (tag 9000), then towards ZP1 (tag 8000) */
+	MPI_Sendrecv(tmp_send1, nsize, OCFD_DATA_TYPE, ID_ZM1, 9000,
+				 tmp_recv2, nsize, OCFD_DATA_TYPE, ID_ZP1, 9000,
+				 MPI_COMM_WORLD, &status);
+	MPI_Sendrecv(tmp_send2, nsize, OCFD_DATA_TYPE, ID_ZP1, 8000,
+				 tmp_recv1, nsize, OCFD_DATA_TYPE, ID_ZM1, 8000,
+				 MPI_COMM_WORLD, &status);
+
+	/* unpack into the ghost planes */
+	for (k = 0; k < LAP; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+				k1 = k * nx * ny + j * nx + i;
+				if (use_zm)
+					f[k][j + LAP][i + LAP] = tmp_recv1[k1];
+				if (use_zp)
+					f[k + nz + LAP][j + LAP][i + LAP] = tmp_recv2[k1];
+			}
+		}
+	}
+
+	free(scratch);
+}
+// ================================================================================
+// -----------------------------------------------------------------------
+
+//  mpi message send and recv, using user defined data type
+/*
+ * x-direction ghost exchange done in place on pf with the derived datatype
+ * TYPE_LAPX2 (one LAP x ny x nz slab per message), no staging buffers.
+ */
+void exchange_boundary_x_deftype(REAL *pf)
+{
+	MPI_Status status;
+
+	/* first interior slab -> XM1, right ghost slab <- XP1 */
+	MPI_Sendrecv(pf + idx2int(LAP, LAP, LAP), 1, TYPE_LAPX2, ID_XM1, 9000,
+				 pf + idx2int(nx + LAP, LAP, LAP), 1, TYPE_LAPX2, ID_XP1, 9000,
+				 MPI_COMM_WORLD, &status);
+
+	/* last interior slab -> XP1, left ghost slab <- XM1 */
+	MPI_Sendrecv(pf + idx2int(nx, LAP, LAP), 1, TYPE_LAPX2, ID_XP1, 8000,
+				 pf + idx2int(0, LAP, LAP), 1, TYPE_LAPX2, ID_XM1, 8000,
+				 MPI_COMM_WORLD, &status);
+}
+// ------------------------------------------------------
+/*
+ * y-direction ghost exchange done in place on pf with the derived datatype
+ * TYPE_LAPY2 (one nx x LAP x nz slab per message), no staging buffers.
+ */
+void exchange_boundary_y_deftype(REAL *pf)
+{
+	MPI_Status status;
+
+	/* first interior slab -> YM1, upper ghost slab <- YP1 */
+	MPI_Sendrecv(pf + idx2int(LAP, LAP, LAP), 1, TYPE_LAPY2, ID_YM1, 9000,
+				 pf + idx2int(LAP, ny + LAP, LAP), 1, TYPE_LAPY2, ID_YP1, 9000,
+				 MPI_COMM_WORLD, &status);
+
+	/* last interior slab -> YP1, lower ghost slab <- YM1 */
+	MPI_Sendrecv(pf + idx2int(LAP, ny, LAP), 1, TYPE_LAPY2, ID_YP1, 8000,
+				 pf + idx2int(LAP, 0, LAP), 1, TYPE_LAPY2, ID_YM1, 8000,
+				 MPI_COMM_WORLD, &status);
+}
+// ------------------------------------------------------------
+/*
+ * z-direction ghost exchange done in place on pf with the derived datatype
+ * TYPE_LAPZ2 (one nx x ny x LAP slab per message), no staging buffers.
+ *
+ * The slab sent to ZP1 consists of the last LAP interior planes, which start at
+ * z index nz (the x and y variants use nx and ny in the same position). The
+ * original code used nx here, which sends the wrong planes whenever nx != nz.
+ */
+void exchange_boundary_z_deftype(REAL *pf)
+{
+	MPI_Status status;
+
+	/* first interior slab -> ZM1, far ghost slab <- ZP1 */
+	MPI_Sendrecv(pf + idx2int(LAP, LAP, LAP), 1, TYPE_LAPZ2, ID_ZM1, 9000,
+				 pf + idx2int(LAP, LAP, nz + LAP), 1, TYPE_LAPZ2, ID_ZP1, 9000,
+				 MPI_COMM_WORLD, &status);
+
+	/* last interior slab -> ZP1, near ghost slab <- ZM1 */
+	MPI_Sendrecv(pf + idx2int(LAP, LAP, nz), 1, TYPE_LAPZ2, ID_ZP1, 8000,
+				 pf + idx2int(LAP, LAP, 0), 1, TYPE_LAPZ2, ID_ZM1, 8000,
+				 MPI_COMM_WORLD, &status);
+}
+// -----------------------------------------------------------------------
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_mpi_dev.cu
+++ b/src/OCFD_mpi_dev.cu
+/*OpenCFD ver 1.4, CopyRight by Li Xinliang, LNM, Institute of Mechanics, CAS, Beijing, Email: lixl@lnm.imech.ac.cn
+MPI subroutines, such as computational domain partitioning and MPI message send and recv
+只支持N_MSG_SIZE=0, -2  两种通信方式 
+*/
+
+#include "mpi.h"
+#include "OCFD_mpi.h"
+#include "parameters.h"
+#include "parameters_d.h"
+
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "OCFD_mpi_dev.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// -------------------------------------------------------------------------------------
+// Message send and recv at inner boundary (or 'MPI boundary')
+/*
+ * Ghost-layer exchange for a device field, staged through the host array hostptr:
+ * x, then y, then z, each with the periodicity flag of its direction.
+ */
+void exchange_boundary_xyz_dev(REAL *hostptr , cudaField * devptr)
+{
+	int periodic;
+
+	periodic = Iperiodic[0];
+	exchange_boundary_x_dev(hostptr , devptr , periodic);
+
+	periodic = Iperiodic[1];
+	exchange_boundary_y_dev(hostptr , devptr , periodic);
+
+	periodic = Iperiodic[2];
+	exchange_boundary_z_dev(hostptr , devptr , periodic);
+}
+// ----------------------------------------------------------------------------------------
+/* x-direction device exchange: forwards unchanged to the host-staged "standard" variant. */
+void exchange_boundary_x_dev(REAL *hostptr , cudaField * devptr , int Iperiodic1)
+{
+	exchange_boundary_x_standard_dev(
+		hostptr ,
+		devptr ,
+		Iperiodic1);
+}
+// -----------------------------------------------------------------------------------------------
+/* y-direction device exchange: forwards unchanged to the host-staged "standard" variant. */
+void exchange_boundary_y_dev(REAL *hostptr, cudaField * devptr , int Iperiodic1)
+{
+	exchange_boundary_y_standard_dev(
+		hostptr ,
+		devptr ,
+		Iperiodic1);
+}
+// -----------------------------------------------------------------------------------------------
+/* z-direction device exchange: forwards unchanged to the host-staged "standard" variant. */
+void exchange_boundary_z_dev(REAL *hostptr , cudaField * devptr , int Iperiodic1)
+{
+	exchange_boundary_z_standard_dev(
+		hostptr ,
+		devptr ,
+		Iperiodic1);
+}
+
+
+
+/*
+ * x-direction exchange for a device field in three steps:
+ * copy the x boundary region device -> host, run the host MPI exchange on
+ * hostptr, copy the x boundary region host -> device.
+ */
+void exchange_boundary_x_standard_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1)
+{
+	/* 1) device -> host */
+	memcpy_bound_x(hostptr , devptr->ptr , devptr->pitch ,
+	               D2H , nx_2lap , ny_2lap , nz_2lap);
+
+	/* 2) MPI exchange on the host copy */
+	exchange_boundary_x_standard(hostptr , Iperiodic1);
+
+	/* 3) host -> device */
+	memcpy_bound_x(hostptr , devptr->ptr , devptr->pitch ,
+	               H2D , nx_2lap , ny_2lap , nz_2lap);
+}
+// ------------------------------------------------------
+/*
+ * y-direction exchange for a device field in three steps:
+ * copy the y boundary region device -> host, run the host MPI exchange on
+ * hostptr, copy the y boundary region host -> device.
+ */
+void exchange_boundary_y_standard_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1)
+{
+	/* 1) device -> host */
+	memcpy_bound_y(hostptr , devptr->ptr , devptr->pitch ,
+	               D2H , nx_2lap , ny_2lap , nz_2lap);
+
+	/* 2) MPI exchange on the host copy */
+	exchange_boundary_y_standard(hostptr , Iperiodic1);
+
+	/* 3) host -> device */
+	memcpy_bound_y(hostptr , devptr->ptr , devptr->pitch ,
+	               H2D , nx_2lap , ny_2lap , nz_2lap);
+}
+// ------------------------------------------------------------
+/*
+ * z-direction exchange for a device field in three steps:
+ * copy the z boundary region device -> host, run the host MPI exchange on
+ * hostptr, copy the z boundary region host -> device.
+ */
+void exchange_boundary_z_standard_dev(REAL *hostptr, cudaField * devptr , int Iperiodic1)
+{
+	/* 1) device -> host */
+	memcpy_bound_z(hostptr , devptr->ptr , devptr->pitch ,
+	               D2H , nx_2lap , ny_2lap , nz_2lap);
+
+	/* 2) MPI exchange on the host copy */
+	exchange_boundary_z_standard(hostptr , Iperiodic1);
+
+	/* 3) host -> device */
+	memcpy_bound_z(hostptr , devptr->ptr , devptr->pitch ,
+	               H2D , nx_2lap , ny_2lap , nz_2lap);
+}
+
+
+/* ===================================================================================================== */
+
+
+// 1 while the packs below exist (set by opencfd_mem_init_mpi_dev, cleared by opencfd_mem_finalize_mpi_dev)
+static char mpi_dev_mem_initialized = 0;
+// Boundary packs created in mpi_dev_buffer_attach, one m/p pair per direction:
+// x packs are LAP x ny x nz
+static cudaFieldPack *b_xm , *b_xp;
+// y packs are nx x LAP x nz
+static cudaFieldPack *b_ym , *b_yp;
+// z packs are nx x ny x LAP
+static cudaFieldPack *b_zm , *b_zp;
+
+/*
+ * Create the six boundary packs, each sized like one LAP-thick slab of the
+ * local block in its direction.
+ */
+void mpi_dev_buffer_attach(){
+    /* x slabs: LAP x ny x nz */
+    new_cudaFieldPack(&b_xm , LAP , ny , nz);
+    new_cudaFieldPack(&b_xp , LAP , ny , nz);
+
+    /* y slabs: nx x LAP x nz */
+    new_cudaFieldPack(&b_ym , nx , LAP , nz);
+    new_cudaFieldPack(&b_yp , nx , LAP , nz);
+
+    /* z slabs: nx x ny x LAP */
+    new_cudaFieldPack(&b_zm , nx , ny , LAP);
+    new_cudaFieldPack(&b_zp , nx , ny , LAP);
+}
+/* Destroy the six boundary packs created by mpi_dev_buffer_attach. */
+void mpi_dev_buffer_detach(){
+    /* x pair */
+    delete_cudaFieldPack(b_xm);
+    delete_cudaFieldPack(b_xp);
+
+    /* y pair */
+    delete_cudaFieldPack(b_ym);
+    delete_cudaFieldPack(b_yp);
+
+    /* z pair */
+    delete_cudaFieldPack(b_zm);
+    delete_cudaFieldPack(b_zp);
+}
+// unsigned int buffer_align_length = 32; // bytes
+// void dev_buffer_malloc(unsigned int N){
+//     // N in bytes
+//     int n = N/buffer_align_length + 1;
+//     cudaMalloc(n*buffer_align_length);
+// }
+// void dev_buffer_free(){
+
+// }
+// void new_cudaField_buffer(){
+    
+// }
+// void delete_cudaField_buffer(){
+
+// }
+
+/* Create the boundary packs once; later calls are no-ops until the packs are finalised. */
+void opencfd_mem_init_mpi_dev(){
+    if(mpi_dev_mem_initialized != 0){
+        return;
+    }
+    mpi_dev_mem_initialized = 1;
+    mpi_dev_buffer_attach();
+}
+/* Destroy the boundary packs if they currently exist; otherwise do nothing. */
+void opencfd_mem_finalize_mpi_dev(){
+    if(mpi_dev_mem_initialized != 1){
+        return;
+    }
+    mpi_dev_mem_initialized = 0;
+    mpi_dev_buffer_detach();
+}
+
+/*
+ * Ghost-layer exchange for a device field using the packed-slab path:
+ * x, then y, then z, each with the periodicity flag of its direction.
+ */
+void exchange_boundary_xyz_packed_dev(REAL *hostptr , cudaField * devptr)
+{
+	int periodic;
+
+	periodic = Iperiodic[0];
+	exchange_boundary_x_packed_dev(hostptr , devptr , periodic);
+
+	periodic = Iperiodic[1];
+	exchange_boundary_y_packed_dev(hostptr , devptr , periodic);
+
+	periodic = Iperiodic[2];
+	exchange_boundary_z_packed_dev(hostptr , devptr , periodic);
+}
+
+/*
+ * Stream-aware variant of the packed-slab exchange: x, then y, then z, each with
+ * the periodicity flag of its direction; the stream pointer is passed through.
+ */
+void exchange_boundary_xyz_Async_packed_dev(REAL *hostptr , cudaField * devptr , cudaStream_t *stream)
+{
+	int periodic;
+
+	periodic = Iperiodic[0];
+	exchange_boundary_x_Async_packed_dev(hostptr , devptr , periodic , stream);
+
+	periodic = Iperiodic[1];
+	exchange_boundary_y_Async_packed_dev(hostptr , devptr , periodic , stream);
+
+	periodic = Iperiodic[2];
+	exchange_boundary_z_Async_packed_dev(hostptr , devptr , periodic , stream);
+}
+
+
+// Gather a box of cells from `data` into the contiguous buffer `pack`.
+// One thread per cell of the box. job.end holds the box SIZE (x, y, z) and
+// job.start the position of the box in `data`, in coordinates that include the LAPs.
+// Threads outside the box do nothing.
+__global__ void cudaFieldBoundaryPack_kernel(cudaField data , cudaFieldPack pack , cudaJobPackage job){
+    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z;
+    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;
+
+    // linear slot inside the pack, x fastest
+    const unsigned int slot = ix + job.end.x*(iy + job.end.y*iz);
+
+    // position of this cell in the field
+    const unsigned int x = ix + job.start.x;
+    const unsigned int y = iy + job.start.y;
+    const unsigned int z = iz + job.start.z;
+
+    pack.ptr[slot] = get_Field_LAP(data , x,y,z);
+}
+// Scatter the contiguous buffer `pack` into a box of cells of `data`.
+// One thread per cell of the box. job.end holds the box SIZE (x, y, z) and
+// job.start the position of the box in `data`, in coordinates that include the LAPs.
+// Threads outside the box do nothing.
+__global__ void cudaFieldBoundaryUnpack_kernel(cudaField data , cudaFieldPack pack , cudaJobPackage job){
+    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z;
+    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;
+
+    // linear slot inside the pack, x fastest
+    const unsigned int slot = ix + job.end.x*(iy + job.end.y*iz);
+
+    // position of this cell in the field
+    const unsigned int x = ix + job.start.x;
+    const unsigned int y = iy + job.start.y;
+    const unsigned int z = iz + job.start.z;
+
+    get_Field_LAP(data , x,y,z) = pack.ptr[slot];
+}
+
+
+// Launch the pack kernel on the default stream.
+// job_in.start : position of the box in the field (coordinates include the LAPs)
+// job_in.end   : SIZE of the box; also the extent the launch grid has to cover
+void cudaFieldBoundaryPack(cudaField * data , cudaFieldPack * pack, cudaJobPackage job_in)
+{
+    dim3 grid , threads;
+    cal_grid_block_dim(&grid , &threads ,
+                       BlockDimX , BlockDimY , BlockDimZ ,
+                       job_in.end.x , job_in.end.y , job_in.end.z);
+    CUDA_LAUNCH(( cudaFieldBoundaryPack_kernel<<<grid,threads>>>(*data , *pack , job_in) ))
+}
+
+// Launch the unpack kernel on the default stream.
+// job_in.start : position of the box in the field (coordinates include the LAPs)
+// job_in.end   : SIZE of the box; also the extent the launch grid has to cover
+void cudaFieldBoundaryUnpack(cudaField * data , cudaFieldPack * pack , cudaJobPackage job_in){
+    dim3 grid , threads;
+    cal_grid_block_dim(&grid , &threads ,
+                       BlockDimX , BlockDimY , BlockDimZ ,
+                       job_in.end.x , job_in.end.y , job_in.end.z);
+    CUDA_LAUNCH(( cudaFieldBoundaryUnpack_kernel<<<grid,threads>>>(*data , *pack , job_in) ))
+}
+
+// x-direction ghost exchange for a device field via packed slabs.
+// A LAP x ny x nz slab is gathered on the device into the pack b_xm, copied to the
+// host buffer pack_send_x, exchanged with MPI_Sendrecv into pack_recv_x, copied
+// back to the device and scattered into the ghost region.
+// Phase 1 moves data towards XM1, phase 2 towards XP1.
+// Iperiodic1 == 1 makes the outermost ranks take part on their outer side as well.
+// hostptr is not used by this function.
+void exchange_boundary_x_packed_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1)
+{   
+    const bool lower_active = (npx != 0) || (Iperiodic1 == 1);
+    const bool upper_active = (npx != NPX0 - 1) || (Iperiodic1 == 1);
+    const int count = LAP*ny*nz;
+    cudaFieldPack * const slab = b_xm;
+    MPI_Status status;
+    cudaJobPackage job(dim3(LAP,LAP,LAP),dim3(LAP,ny,nz));
+
+    // ---- phase 1: first interior slab -> XM1, upper ghost slab <- XP1 ----
+    if(lower_active){
+        cudaFieldBoundaryPack(devptr , slab , job);
+        CUDA_CALL(( cudaMemcpy(pack_send_x , slab->ptr , count*sizeof(REAL) , cudaMemcpyDeviceToHost) ))
+    }
+    MPI_Sendrecv(pack_send_x , count , OCFD_DATA_TYPE , ID_XM1 , 1 ,
+                 pack_recv_x , count , OCFD_DATA_TYPE , ID_XP1 , 1 ,
+                 MPI_COMM_WORLD , &status);
+    if(upper_active){
+        CUDA_CALL(( cudaMemcpy(slab->ptr , pack_recv_x , count*sizeof(REAL) , cudaMemcpyHostToDevice) ))
+        job.start.x = nx_lap;
+        cudaFieldBoundaryUnpack(devptr , slab , job);
+
+        // ---- phase 2: last interior slab -> XP1 ----
+        job.start.x = nx;
+        cudaFieldBoundaryPack(devptr , slab , job);
+        CUDA_CALL(( cudaMemcpy(pack_send_x , slab->ptr , count*sizeof(REAL) , cudaMemcpyDeviceToHost) ))
+    }
+    MPI_Sendrecv(pack_send_x , count , OCFD_DATA_TYPE , ID_XP1 , 1 ,
+                 pack_recv_x , count , OCFD_DATA_TYPE , ID_XM1 , 1 ,
+                 MPI_COMM_WORLD , &status);
+    // ---- phase 2: lower ghost slab <- XM1 ----
+    if(lower_active){
+        CUDA_CALL(( cudaMemcpy(slab->ptr , pack_recv_x , count*sizeof(REAL) , cudaMemcpyHostToDevice) ))
+        job.start.x = 0;
+        cudaFieldBoundaryUnpack(devptr , slab , job);
+    }
+}
+
+
+// y-direction ghost exchange for a device field via packed slabs.
+// An nx x LAP x nz slab is gathered on the device into the pack b_ym, copied to the
+// host buffer pack_send_y, exchanged with MPI_Sendrecv into pack_recv_y, copied
+// back to the device and scattered into the ghost region.
+// Phase 1 moves data towards YM1, phase 2 towards YP1.
+// Iperiodic1 == 1 makes the outermost ranks take part on their outer side as well.
+// hostptr is not used by this function.
+void exchange_boundary_y_packed_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1)
+{   
+    const bool lower_active = (npy != 0) || (Iperiodic1 == 1);
+    const bool upper_active = (npy != NPY0 - 1) || (Iperiodic1 == 1);
+    const int count = LAP*nx*nz;
+    cudaFieldPack * const slab = b_ym;
+    MPI_Status status;
+    cudaJobPackage job(dim3(LAP,LAP,LAP),dim3(nx , LAP ,nz));
+
+    // ---- phase 1: first interior slab -> YM1, upper ghost slab <- YP1 ----
+    if(lower_active){
+        cudaFieldBoundaryPack(devptr , slab , job);
+        CUDA_CALL(( cudaMemcpy(pack_send_y , slab->ptr , count*sizeof(REAL) , cudaMemcpyDeviceToHost) ))
+    }
+    MPI_Sendrecv(pack_send_y , count , OCFD_DATA_TYPE , ID_YM1 , 1 ,
+                 pack_recv_y , count , OCFD_DATA_TYPE , ID_YP1 , 1 ,
+                 MPI_COMM_WORLD , &status);
+    if(upper_active){
+        CUDA_CALL(( cudaMemcpy(slab->ptr , pack_recv_y , count*sizeof(REAL) , cudaMemcpyHostToDevice) ))
+        job.start.y = ny_lap;
+        cudaFieldBoundaryUnpack(devptr , slab , job);
+
+        // ---- phase 2: last interior slab -> YP1 ----
+        job.start.y = ny;
+        cudaFieldBoundaryPack(devptr , slab , job);
+        CUDA_CALL(( cudaMemcpy(pack_send_y , slab->ptr , count*sizeof(REAL) , cudaMemcpyDeviceToHost) ))
+    }
+    MPI_Sendrecv(pack_send_y , count , OCFD_DATA_TYPE , ID_YP1 , 1 ,
+                 pack_recv_y , count , OCFD_DATA_TYPE , ID_YM1 , 1 ,
+                 MPI_COMM_WORLD , &status);
+    // ---- phase 2: lower ghost slab <- YM1 ----
+    if(lower_active){
+        CUDA_CALL(( cudaMemcpy(slab->ptr , pack_recv_y , count*sizeof(REAL) , cudaMemcpyHostToDevice) ))
+        job.start.y = 0;
+        cudaFieldBoundaryUnpack(devptr , slab , job);
+    }
+}
+
+
+// z-direction ghost exchange for a device field via packed slabs.
+// An nx x ny x LAP slab is gathered on the device into the pack b_zm, copied to the
+// host buffer pack_send_z, exchanged with MPI_Sendrecv into pack_recv_z, copied
+// back to the device and scattered into the ghost region.
+// Phase 1 moves data towards ZM1, phase 2 towards ZP1.
+// Iperiodic1 == 1 makes the outermost ranks take part on their outer side as well.
+// hostptr is not used by this function.
+void exchange_boundary_z_packed_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1)
+{   
+    const bool lower_active = (npz != 0) || (Iperiodic1 == 1);
+    const bool upper_active = (npz != NPZ0 - 1) || (Iperiodic1 == 1);
+    const int count = LAP*nx*ny;
+    cudaFieldPack * const slab = b_zm;
+    MPI_Status status;
+    cudaJobPackage job(dim3(LAP,LAP,LAP),dim3(nx,ny,LAP));
+
+    // ---- phase 1: first interior slab -> ZM1, far ghost slab <- ZP1 ----
+    if(lower_active){
+        cudaFieldBoundaryPack(devptr , slab , job);
+        CUDA_CALL(( cudaMemcpy(pack_send_z , slab->ptr , count*sizeof(REAL) , cudaMemcpyDeviceToHost) ))
+    }
+    MPI_Sendrecv(pack_send_z , count , OCFD_DATA_TYPE , ID_ZM1 , 1 ,
+                 pack_recv_z , count , OCFD_DATA_TYPE , ID_ZP1 , 1 ,
+                 MPI_COMM_WORLD , &status);
+    if(upper_active){
+        CUDA_CALL(( cudaMemcpy(slab->ptr , pack_recv_z , count*sizeof(REAL) , cudaMemcpyHostToDevice) ))
+        job.start.z = nz_lap;
+        cudaFieldBoundaryUnpack(devptr , slab , job);
+
+        // ---- phase 2: last interior slab -> ZP1 ----
+        job.start.z = nz;
+        cudaFieldBoundaryPack(devptr , slab , job);
+        CUDA_CALL(( cudaMemcpy(pack_send_z , slab->ptr , count*sizeof(REAL) , cudaMemcpyDeviceToHost) ))
+    }
+    MPI_Sendrecv(pack_send_z , count , OCFD_DATA_TYPE , ID_ZP1 , 1 ,
+                 pack_recv_z , count , OCFD_DATA_TYPE , ID_ZM1 , 1 ,
+                 MPI_COMM_WORLD , &status);
+    // ---- phase 2: near ghost slab <- ZM1 ----
+    if(lower_active){
+        CUDA_CALL(( cudaMemcpy(slab->ptr , pack_recv_z , count*sizeof(REAL) , cudaMemcpyHostToDevice) ))
+        job.start.z = 0;
+        cudaFieldBoundaryUnpack(devptr , slab , job);
+    }
+}
+
+// Launches cudaFieldBoundaryPack_kernel on *stream to gather the slab described by job_in
+// from `data` into `pack`. job_in.start is the slab origin (LAP-inclusive indexing);
+// job_in.end is used here as the slab extent when sizing the grid.
+void cudaFieldBoundaryPack_Async(cudaField * data , cudaFieldPack * pack, cudaJobPackage job_in, cudaStream_t *stream)
+{
+    dim3 grid, block;
+    cal_grid_block_dim(&grid, &block,
+                       BlockDimX, BlockDimY, BlockDimZ,
+                       job_in.end.x, job_in.end.y, job_in.end.z);
+    CUDA_LAUNCH(( cudaFieldBoundaryPack_kernel<<<grid, block, 0, *stream>>>(*data, *pack, job_in) ))
+}
+
+// Launches cudaFieldBoundaryUnpack_kernel on *stream to scatter `pack` back into the slab of
+// `data` described by job_in. job_in.start is the slab origin (LAP-inclusive indexing);
+// job_in.end is used here as the slab extent when sizing the grid.
+void cudaFieldBoundaryUnpack_Async(cudaField * data , cudaFieldPack * pack , cudaJobPackage job_in, cudaStream_t *stream){
+    dim3 grid, block;
+    cal_grid_block_dim(&grid, &block,
+                       BlockDimX, BlockDimY, BlockDimZ,
+                       job_in.end.x, job_in.end.y, job_in.end.z);
+    CUDA_LAUNCH(( cudaFieldBoundaryUnpack_kernel<<<grid, block, 0, *stream>>>(*data, *pack, job_in) ))
+}
+
+// Assumption: only the boundary (halo) layers are exchanged.
+//
+// Stream-ordered halo exchange along x through the packed device buffer b_xm.
+//   hostptr    : not referenced in this routine.
+//   devptr     : device field whose x halo layers are refreshed.
+//   Iperiodic1 : 1 forces the exchange on the first/last rank in x as well.
+//   stream     : stream on which pack/unpack kernels and copies are enqueued.
+// The stream is synchronized before each MPI_Sendrecv so the send buffer is complete and the
+// receive buffer is no longer being read by an earlier host-to-device copy on this stream.
+// The final copy + unpack are left pending on *stream when the function returns.
+// NOTE(review): MPI_Sendrecv is issued unconditionally; presumably ID_XM1/ID_XP1 are
+// MPI_PROC_NULL on non-periodic physical boundaries -- confirm against the MPI setup.
+void exchange_boundary_x_Async_packed_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1 , cudaStream_t *stream)
+{   
+    cudaFieldPack * pack;
+    MPI_Status status;
+    int size = LAP*ny*nz;
+    pack = b_xm;
+    // Slab of LAP layers in x; job.start.x is moved to select the layers packed/unpacked.
+    cudaJobPackage job(dim3(LAP,LAP,LAP),dim3(LAP,ny,nz));
+
+    // Round 1: slab starting at x = LAP goes to the -x neighbour.
+    if(npx != 0 || Iperiodic1 == 1){
+        cudaFieldBoundaryPack_Async(devptr , pack ,job, stream);
+        CUDA_CALL(( cudaMemcpyAsync(pack_send_x , pack->ptr , size*sizeof(REAL) , cudaMemcpyDeviceToHost, *stream) ))
+    }
+    // Check the synchronization result: asynchronous kernel/copy failures surface here.
+    CUDA_CALL(( cudaStreamSynchronize(*stream) ))
+    MPI_Sendrecv(pack_send_x , size , OCFD_DATA_TYPE , ID_XM1 , 1 , pack_recv_x , size , OCFD_DATA_TYPE , ID_XP1 , 1 , MPI_COMM_WORLD , &status);
+    if (npx != NPX0 - 1 || Iperiodic1 == 1){
+        // Data received from the +x neighbour is written at x = nx_lap.
+        CUDA_CALL(( cudaMemcpyAsync(pack->ptr , pack_recv_x , size*sizeof(REAL) , cudaMemcpyHostToDevice, *stream) ))
+        job.start.x = nx_lap;
+        cudaFieldBoundaryUnpack_Async(devptr, pack ,job, stream);
+    }
+
+    // Round 2: slab starting at x = nx goes to the +x neighbour.
+    if(npx != NPX0 - 1 || Iperiodic1 == 1){
+        job.start.x = nx;
+        cudaFieldBoundaryPack_Async(devptr , pack ,job, stream);
+        CUDA_CALL(( cudaMemcpyAsync(pack_send_x , pack->ptr , size*sizeof(REAL) , cudaMemcpyDeviceToHost, *stream) ))
+    }
+    CUDA_CALL(( cudaStreamSynchronize(*stream) ))
+    MPI_Sendrecv(pack_send_x , size , OCFD_DATA_TYPE , ID_XP1 , 1 , pack_recv_x , size , OCFD_DATA_TYPE , ID_XM1 , 1 , MPI_COMM_WORLD , &status);
+    if (npx != 0 || Iperiodic1 == 1){
+        // Data received from the -x neighbour is written at x = 0.
+        CUDA_CALL(( cudaMemcpyAsync(pack->ptr , pack_recv_x , size*sizeof(REAL) , cudaMemcpyHostToDevice, *stream) ))
+        job.start.x = 0;
+        cudaFieldBoundaryUnpack_Async(devptr, pack ,job, stream);
+    }
+    
+}
+
+// Stream-ordered halo exchange along y through the packed device buffer b_ym.
+//   hostptr    : not referenced in this routine.
+//   devptr     : device field whose y halo layers are refreshed.
+//   Iperiodic1 : 1 forces the exchange on the first/last rank in y as well.
+//   stream     : stream on which pack/unpack kernels and copies are enqueued.
+// The stream is synchronized before each MPI_Sendrecv so the send buffer is complete and the
+// receive buffer is no longer being read by an earlier host-to-device copy on this stream.
+// The final copy + unpack are left pending on *stream when the function returns.
+// NOTE(review): MPI_Sendrecv is issued unconditionally; presumably ID_YM1/ID_YP1 are
+// MPI_PROC_NULL on non-periodic physical boundaries -- confirm against the MPI setup.
+void exchange_boundary_y_Async_packed_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1 , cudaStream_t *stream)
+{   
+    cudaFieldPack * pack;
+    MPI_Status status;
+    int size = LAP*nx*nz;
+    pack = b_ym;
+    // Slab of LAP layers in y; job.start.y is moved to select the layers packed/unpacked.
+    cudaJobPackage job(dim3(LAP,LAP,LAP),dim3(nx , LAP ,nz));
+
+    // Round 1: slab starting at y = LAP goes to the -y neighbour.
+    if(npy != 0 || Iperiodic1 == 1){
+        cudaFieldBoundaryPack_Async(devptr , pack ,job, stream);
+        CUDA_CALL(( cudaMemcpyAsync(pack_send_y , pack->ptr , size*sizeof(REAL) , cudaMemcpyDeviceToHost, *stream) ))
+    }
+    // Check the synchronization result: asynchronous kernel/copy failures surface here.
+    CUDA_CALL(( cudaStreamSynchronize(*stream) ))
+    MPI_Sendrecv(pack_send_y , size , OCFD_DATA_TYPE , ID_YM1 , 1 , pack_recv_y , size , OCFD_DATA_TYPE , ID_YP1 , 1 , MPI_COMM_WORLD , &status);
+    if (npy != NPY0 - 1 || Iperiodic1 == 1){
+        // Data received from the +y neighbour is written at y = ny_lap.
+        CUDA_CALL(( cudaMemcpyAsync(pack->ptr , pack_recv_y , size*sizeof(REAL) , cudaMemcpyHostToDevice, *stream) ))
+        job.start.y = ny_lap;
+        cudaFieldBoundaryUnpack_Async(devptr, pack ,job, stream);
+    }
+
+    // Round 2: slab starting at y = ny goes to the +y neighbour.
+    if(npy != NPY0 - 1 || Iperiodic1 == 1){
+        job.start.y = ny;
+        cudaFieldBoundaryPack_Async(devptr , pack ,job, stream);
+        CUDA_CALL(( cudaMemcpyAsync(pack_send_y , pack->ptr , size*sizeof(REAL) , cudaMemcpyDeviceToHost, *stream) ))
+    }
+    CUDA_CALL(( cudaStreamSynchronize(*stream) ))
+    MPI_Sendrecv(pack_send_y , size , OCFD_DATA_TYPE , ID_YP1 , 1 , pack_recv_y , size , OCFD_DATA_TYPE , ID_YM1 , 1 , MPI_COMM_WORLD , &status);
+    if (npy != 0 || Iperiodic1 == 1){
+        // Data received from the -y neighbour is written at y = 0.
+        CUDA_CALL(( cudaMemcpyAsync(pack->ptr , pack_recv_y , size*sizeof(REAL) , cudaMemcpyHostToDevice, *stream) ))
+        job.start.y = 0;
+        cudaFieldBoundaryUnpack_Async(devptr, pack ,job, stream);
+    }
+
+}
+
+// Stream-ordered halo exchange along z through the packed device buffer b_zm.
+//   hostptr    : not referenced in this routine.
+//   devptr     : device field whose z halo layers are refreshed.
+//   Iperiodic1 : 1 forces the exchange on the first/last rank in z as well.
+//   stream     : stream on which pack/unpack kernels and copies are enqueued.
+// The stream is synchronized before each MPI_Sendrecv so the send buffer is complete and the
+// receive buffer is no longer being read by an earlier host-to-device copy on this stream.
+// The final copy + unpack are left pending on *stream when the function returns.
+// NOTE(review): MPI_Sendrecv is issued unconditionally; presumably ID_ZM1/ID_ZP1 are
+// MPI_PROC_NULL on non-periodic physical boundaries -- confirm against the MPI setup.
+void exchange_boundary_z_Async_packed_dev(REAL *hostptr , cudaField * devptr, int Iperiodic1 , cudaStream_t *stream)
+{   
+    cudaFieldPack * pack;
+    MPI_Status status;
+    int size = LAP*nx*ny;
+    pack = b_zm;
+    // Slab of LAP layers in z; job.start.z is moved to select the layers packed/unpacked.
+    cudaJobPackage job(dim3(LAP,LAP,LAP),dim3(nx,ny,LAP));
+
+    // Round 1: slab starting at z = LAP goes to the -z neighbour.
+    if(npz != 0 || Iperiodic1 == 1){
+        cudaFieldBoundaryPack_Async(devptr , pack ,job, stream);
+        CUDA_CALL(( cudaMemcpyAsync(pack_send_z , pack->ptr , size*sizeof(REAL) , cudaMemcpyDeviceToHost, *stream) ))
+    }
+    // Check the synchronization result: asynchronous kernel/copy failures surface here.
+    CUDA_CALL(( cudaStreamSynchronize(*stream) ))
+    MPI_Sendrecv(pack_send_z , size , OCFD_DATA_TYPE , ID_ZM1 , 1 , pack_recv_z , size , OCFD_DATA_TYPE , ID_ZP1 , 1 , MPI_COMM_WORLD , &status);
+    if (npz != NPZ0 - 1 || Iperiodic1 == 1){
+        // Data received from the +z neighbour is written at z = nz_lap.
+        CUDA_CALL(( cudaMemcpyAsync(pack->ptr , pack_recv_z , size*sizeof(REAL) , cudaMemcpyHostToDevice, *stream) ))
+        job.start.z = nz_lap;
+        cudaFieldBoundaryUnpack_Async(devptr, pack ,job, stream);
+    }
+
+    // Round 2: slab starting at z = nz goes to the +z neighbour.
+    if(npz != NPZ0 - 1 || Iperiodic1 == 1){
+        job.start.z = nz;
+        cudaFieldBoundaryPack_Async(devptr , pack ,job, stream);
+        CUDA_CALL(( cudaMemcpyAsync(pack_send_z , pack->ptr , size*sizeof(REAL) , cudaMemcpyDeviceToHost, *stream) ))
+    }
+    CUDA_CALL(( cudaStreamSynchronize(*stream) ))
+    MPI_Sendrecv(pack_send_z , size , OCFD_DATA_TYPE , ID_ZP1 , 1 , pack_recv_z , size , OCFD_DATA_TYPE , ID_ZM1 , 1 , MPI_COMM_WORLD , &status);
+    if (npz != 0 || Iperiodic1 == 1){
+        // Data received from the -z neighbour is written at z = 0.
+        CUDA_CALL(( cudaMemcpyAsync(pack->ptr , pack_recv_z , size*sizeof(REAL) , cudaMemcpyHostToDevice, *stream) ))
+        job.start.z = 0;
+        cudaFieldBoundaryUnpack_Async(devptr, pack ,job, stream);
+    }
+
+}
+
+
+/* 
+
+    switch(dir){
+        case MPI_X_DIR : {
+            MPI_Sendrecv(pack_send , size , OCFD_DATA_TYPE , ID_XM1 , 1 , pack_recv , size , OCFD_DATA_TYPE , ID_XP1 , 1 , MPI_COMM_WORKD , &status);
+            break;
+        }
+        case MPI_Y_DIR : {
+            MPI_Sendrecv(pack_send , size , OCFD_DATA_TYPE , ID_XM1 , 1 , pack_recv , size , OCFD_DATA_TYPE , ID_XP1 , 1 , MPI_COMM_WORKD , &status);
+            break;
+        }
+        case MPI_Z_DIR : {
+            MPI_Sendrecv(pack_send , size , OCFD_DATA_TYPE , ID_XM1 , 1 , pack_recv , size , OCFD_DATA_TYPE , ID_XP1 , 1 , MPI_COMM_WORKD , &status);
+            break;
+        }
+    }
+
+*/
+
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_split.cu
+++ b/src/OCFD_split.cu
+// =============================================================================================
+//  Includes the 3-D Jacobian transformation
+
+#include <math.h>
+#include "utility.h"
+#include "OCFD_split.h"
+#include "parameters.h"
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+__global__ void split_Jac3d_Stager_Warming_ker(cudaField d0, cudaField u0, cudaField v0, cudaField w0, cudaField cc0, cudaSoA fp, cudaSoA fm, cudaField Akx, cudaField Aky, cudaField Akz, REAL tmp0, REAL split_C1, REAL split_C3, cudaJobPackage job)
+{
+	// eyes on cells WITH LAPs
+
+	unsigned int x = threadIdx.x + blockIdx.x*blockDim.x + job.start.x;
+	unsigned int y = threadIdx.y + blockIdx.y*blockDim.y + job.start.y;
+	unsigned int z = threadIdx.z + blockIdx.z*blockDim.z + job.start.z;
+
+	if( x<job.end.x && y<job.end.y && z<job.end.z){
+        REAL A1, A2, A3, ss;
+	    REAL E1, E2, E3, E1P, E2P, E3P, E1M, E2M, E3M;
+	    REAL uc1, uc2, vc1, vc2, wc1, wc2, vvc1, vvc2, vv, W2;
+
+        REAL u, v, w, cc, d;
+
+		A1 = get_Field_LAP(Ax, x, y, z);
+		A2 = get_Field_LAP(Ay, x, y, z);
+		A3 = get_Field_LAP(Az, x, y, z);
+
+		ss = sqrt(A1*A1 + A2*A2 + A3*A3);
+
+        d = get_Field_LAP(d0, x, y, z);
+        u = get_Field_LAP(u0, x, y, z);
+        v = get_Field_LAP(v0, x, y, z);
+        w = get_Field_LAP(w0, x, y, z);
+        cc = get_Field_LAP(cc0, x, y, z);
+
+		E1 = A1*u + A2*v + A3*w;
+		E2 = E1 - cc*ss;
+		E3 = E1 + cc*ss;
+
+        ss = 1.0/ss;
+
+        A1 *= ss;
+		A2 *= ss;
+		A3 *= ss;
+
+        tmp0 = d*tmp0;
+
+		E1P = (E1 + sqrt(E1 * E1 + epsl_sw_d * epsl_sw_d)) * 0.50;
+		E2P = (E2 + sqrt(E2 * E2 + epsl_sw_d * epsl_sw_d)) * 0.50;
+		E3P = (E3 + sqrt(E3 * E3 + epsl_sw_d * epsl_sw_d)) * 0.50;
+
+		E1M = E1 - E1P;
+		E2M = E2 - E2P;
+		E3M = E3 - E3P;
+		// ----------------------------------------
+
+		uc1 = u - cc * A1;
+		uc2 = u + cc * A1;
+		vc1 = v - cc * A2;
+		vc2 = v + cc * A2;
+		wc1 = w - cc * A3;
+		wc2 = w + cc * A3;
+		vvc1 = (uc1 * uc1 + vc1 * vc1 + wc1 * wc1) * 0.50;
+		vvc2 = (uc2 * uc2 + vc2 * vc2 + wc2 * wc2) * 0.50;
+		vv = (Gamma_d - 1.0) * (u * u + v * v + w * w );
+		W2 = split_C3 * cc * cc;
+
+		
+		get_SoA_LAP(fp, x, y, z, 0) = tmp0 * (split_C1 * E1P + E2P + E3P);
+		get_SoA_LAP(fp, x, y, z, 1) = tmp0 * (split_C1 * E1P * u + E2P * uc1 + E3P * uc2);
+		get_SoA_LAP(fp, x, y, z, 2) = tmp0 * (split_C1 * E1P * v + E2P * vc1 + E3P * vc2);
+		get_SoA_LAP(fp, x, y, z, 3) = tmp0 * (split_C1 * E1P * w + E2P * wc1 + E3P * wc2);
+		get_SoA_LAP(fp, x, y, z, 4) = tmp0 * (E1P * vv + E2P * vvc1 + E3P * vvc2 + W2 * (E2P + E3P));
+		// --------------------------------------------------------
+
+		get_SoA_LAP(fm, x, y, z, 0) = tmp0 * (split_C1 * E1M + E2M + E3M);
+		get_SoA_LAP(fm, x, y, z, 1) = tmp0 * (split_C1 * E1M * u + E2M * uc1 + E3M * uc2);
+		get_SoA_LAP(fm, x, y, z, 2) = tmp0 * (split_C1 * E1M * v + E2M * vc1 + E3M * vc2);
+		get_SoA_LAP(fm, x, y, z, 3) = tmp0 * (split_C1 * E1M * w + E2M * wc1 + E3M * wc2);
+		get_SoA_LAP(fm, x, y, z, 4) = tmp0 * (E1M * vv + E2M * vvc1 + E3M * vvc2 + W2 * (E2M + E3M));
+	}
+}*/
+
+// Per-cell flux splitting for one coordinate direction (presumably Steger-Warming flux
+// vector splitting; the identifier spells it "Stager").
+//   x, y, z            : cell indices passed straight to get_SoA_LAP.
+//   tmp0               : common prefactor applied to every flux component (the caller in this
+//                        file passes density times its own tmp0).
+//   u, v, w, cc        : velocity components and the value read from the cc field
+//                        (presumably the sound speed -- confirm).
+//   fp, fm             : outputs; components 0..4 of the positive and negative split fluxes.
+//   A1, A2, A3         : metric coefficients of the direction being split (by-value copies,
+//                        normalised in place below).
+//   split_C1, split_C3 : splitting constants supplied by the caller.
+// NOTE(review): there is no guard against A1 = A2 = A3 = 0, which would make 1.0/ss divide
+// by zero.
+__device__ void Stager_Warming_ker(unsigned int x, unsigned int y, unsigned int z, REAL tmp0, REAL u, REAL v, REAL w, REAL cc, cudaSoA fp, cudaSoA fm, REAL A1, REAL A2, REAL A3, REAL split_C1, REAL split_C3){
+        REAL ss;
+	    REAL E1, E2, E3, E1P, E2P, E3P, E1M, E2M, E3M;
+	    REAL uc1, uc2, vc1, vc2, wc1, wc2, vvc1, vvc2, vv, W2;
+
+		// Magnitude of the metric vector.
+		ss = sqrt(A1*A1 + A2*A2 + A3*A3);
+
+		// Three characteristic speeds: convective, and convective -/+ cc*|A|.
+		E1 = A1*u + A2*v + A3*w;
+		E2 = E1 - cc*ss;
+		E3 = E1 + cc*ss;
+
+        // From here on ss holds the reciprocal magnitude, used to normalise (A1, A2, A3).
+        ss = 1.0/ss;
+
+        A1 *= ss;
+		A2 *= ss;
+		A3 *= ss;
+
+		// Smoothed positive parts: (E + sqrt(E^2 + eps^2)) / 2 with eps = epsl_sw_d.
+		E1P = (E1 + sqrt(E1 * E1 + epsl_sw_d * epsl_sw_d)) * 0.50;
+		E2P = (E2 + sqrt(E2 * E2 + epsl_sw_d * epsl_sw_d)) * 0.50;
+		E3P = (E3 + sqrt(E3 * E3 + epsl_sw_d * epsl_sw_d)) * 0.50;
+
+		// Negative parts are the remainders, so EP + EM == E for each speed.
+		E1M = E1 - E1P;
+		E2M = E2 - E2P;
+		E3M = E3 - E3P;
+		// ----------------------------------------
+
+		// Velocities shifted by -/+ cc along the unit metric direction, and their kinetic terms.
+		uc1 = u - cc * A1;
+		uc2 = u + cc * A1;
+		vc1 = v - cc * A2;
+		vc2 = v + cc * A2;
+		wc1 = w - cc * A3;
+		wc2 = w + cc * A3;
+		vvc1 = (uc1 * uc1 + vc1 * vc1 + wc1 * wc1) * 0.50;
+		vvc2 = (uc2 * uc2 + vc2 * vc2 + wc2 * wc2) * 0.50;
+		vv = (Gamma_d - 1.0) * (u * u + v * v + w * w );
+		W2 = split_C3 * cc * cc;
+
+		
+		// Positive split flux, components 0..4.
+		get_SoA_LAP(fp, x, y, z, 0) = tmp0 * (split_C1 * E1P + E2P + E3P);
+		get_SoA_LAP(fp, x, y, z, 1) = tmp0 * (split_C1 * E1P * u + E2P * uc1 + E3P * uc2);
+		get_SoA_LAP(fp, x, y, z, 2) = tmp0 * (split_C1 * E1P * v + E2P * vc1 + E3P * vc2);
+		get_SoA_LAP(fp, x, y, z, 3) = tmp0 * (split_C1 * E1P * w + E2P * wc1 + E3P * wc2);
+		get_SoA_LAP(fp, x, y, z, 4) = tmp0 * (E1P * vv + E2P * vvc1 + E3P * vvc2 + W2 * (E2P + E3P));
+		// --------------------------------------------------------
+
+		// Negative split flux, components 0..4 (same formulas with the negative parts).
+		get_SoA_LAP(fm, x, y, z, 0) = tmp0 * (split_C1 * E1M + E2M + E3M);
+		get_SoA_LAP(fm, x, y, z, 1) = tmp0 * (split_C1 * E1M * u + E2M * uc1 + E3M * uc2);
+		get_SoA_LAP(fm, x, y, z, 2) = tmp0 * (split_C1 * E1M * v + E2M * vc1 + E3M * vc2);
+		get_SoA_LAP(fm, x, y, z, 3) = tmp0 * (split_C1 * E1M * w + E2M * wc1 + E3M * wc2);
+		get_SoA_LAP(fm, x, y, z, 4) = tmp0 * (E1M * vv + E2M * vvc1 + E3M * vvc2 + W2 * (E2M + E3M));
+}
+
+
+// One thread per cell (indices include the LAP halo). Reads density, velocity and the cc
+// field from `sw`, then calls Stager_Warming_ker three times -- with the (Akx,Aky,Akz),
+// (Aix,Aiy,Aiz) and (Asx,Asy,Asz) metric triples -- writing the split fluxes into the
+// fp_x/fm_x, fp_y/fm_y and fp_z/fm_z pairs respectively.
+// tmp0 is multiplied by the local density before being handed to the device helper.
+__global__ void split_Jac3d_Stager_Warming_ker(sw_split sw, cudaSoA fp_x, cudaSoA fm_x, cudaSoA fp_y, cudaSoA fm_y, cudaSoA fp_z, cudaSoA fm_z, REAL tmp0, REAL split_C1, REAL split_C3, cudaJobPackage job)
+{
+    const unsigned int x = threadIdx.x + blockIdx.x*blockDim.x + job.start.x;
+    const unsigned int y = threadIdx.y + blockIdx.y*blockDim.y + job.start.y;
+    const unsigned int z = threadIdx.z + blockIdx.z*blockDim.z + job.start.z;
+
+    // Threads outside the job box have nothing to do.
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    const REAL rho = get_Field_LAP(sw.d, x, y, z);
+    const REAL vel_u = get_Field_LAP(sw.u, x, y, z);
+    const REAL vel_v = get_Field_LAP(sw.v, x, y, z);
+    const REAL vel_w = get_Field_LAP(sw.w, x, y, z);
+    const REAL snd = get_Field_LAP(sw.cc, x, y, z);
+
+    // Density-weighted prefactor shared by all three directions.
+    const REAL scale = rho*tmp0;
+
+    REAL m1, m2, m3;
+
+    // Direction written to fp_x / fm_x.
+    m1 = get_Field_LAP(sw.Akx, x, y, z);
+    m2 = get_Field_LAP(sw.Aky, x, y, z);
+    m3 = get_Field_LAP(sw.Akz, x, y, z);
+    Stager_Warming_ker(x, y, z, scale, vel_u, vel_v, vel_w, snd, fp_x, fm_x, m1, m2, m3, split_C1, split_C3);
+
+    // Direction written to fp_y / fm_y.
+    m1 = get_Field_LAP(sw.Aix, x, y, z);
+    m2 = get_Field_LAP(sw.Aiy, x, y, z);
+    m3 = get_Field_LAP(sw.Aiz, x, y, z);
+    Stager_Warming_ker(x, y, z, scale, vel_u, vel_v, vel_w, snd, fp_y, fm_y, m1, m2, m3, split_C1, split_C3);
+
+    // Direction written to fp_z / fm_z.
+    m1 = get_Field_LAP(sw.Asx, x, y, z);
+    m2 = get_Field_LAP(sw.Asy, x, y, z);
+    m3 = get_Field_LAP(sw.Asz, x, y, z);
+    Stager_Warming_ker(x, y, z, scale, vel_u, vel_v, vel_w, snd, fp_z, fm_z, m1, m2, m3, split_C1, split_C3);
+}
+
+// Host launcher for split_Jac3d_Stager_Warming_ker on *stream.
+// The requested job box is widened by LAP cells on every side before launch, and the kernel
+// is configured with a fixed 8 x 4 x 4 block request rather than BlockDimX/Y/Z.
+// Uses the global fields (pd_d, pu_d, ..., pAsz_d) and constants (tmp0, split_C1, split_C3).
+void Stager_Warming(cudaJobPackage job_in, cudaSoA *fp_x, cudaSoA *fm_x, cudaSoA *fp_y, cudaSoA *fm_y, cudaSoA *fp_z, cudaSoA *fm_z, cudaStream_t *stream){
+    dim3 block, grid, extent;
+    jobsize(&job_in, &extent);
+    cal_grid_block_dim(&grid, &block, 8, 4, 4,
+                       extent.x+2*LAP, extent.y+2*LAP, extent.z+2*LAP);
+
+    // Job box including the halo on both sides of each axis.
+    cudaJobPackage padded( dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP),
+                           dim3(job_in.end.x + LAP, job_in.end.y + LAP, job_in.end.z + LAP) );
+
+    // Bundle of primitive fields and the three metric triples, passed to the kernel by value.
+    sw_split sw = {*pd_d, *pu_d, *pv_d, *pw_d, *pcc_d,
+                   *pAkx_d, *pAky_d, *pAkz_d,
+                   *pAix_d, *pAiy_d, *pAiz_d,
+                   *pAsx_d, *pAsy_d, *pAsz_d};
+
+    CUDA_LAUNCH(( split_Jac3d_Stager_Warming_ker<<<grid , block, 0, *stream>>>(sw, *fp_x, *fm_x, *fp_y, *fm_y, *fp_z, *fm_z, tmp0, split_C1, split_C3, padded) ));
+}
+
+/*
+__global__ void split_Jac3d_Stager_Warming_ker_out(sw_split_out sw, cudaSoA fp, cudaSoA fm, REAL tmp0, REAL split_C1, REAL split_C3, cudaJobPackage job)
+{
+	// eyes on cells WITH LAPs
+
+	unsigned int x = threadIdx.x + blockIdx.x*blockDim.x + job.start.x;
+	unsigned int y = threadIdx.y + blockIdx.y*blockDim.y + job.start.y;
+	unsigned int z = threadIdx.z + blockIdx.z*blockDim.z + job.start.z;
+
+	if( x<job.end.x && y<job.end.y && z<job.end.z){
+        REAL A1, A2, A3;
+
+        REAL u, v, w, cc, d;
+
+        d = get_Field_LAP(sw.d, x, y, z);
+        u = get_Field_LAP(sw.u, x, y, z);
+        v = get_Field_LAP(sw.v, x, y, z);
+        w = get_Field_LAP(sw.w, x, y, z);
+        cc = get_Field_LAP(sw.cc, x, y, z);
+
+        tmp0 = d*tmp0;
+
+		A1 = get_Field_LAP(sw.Ax, x, y, z);
+		A2 = get_Field_LAP(sw.Ay, x, y, z);
+		A3 = get_Field_LAP(sw.Az, x, y, z); 
+
+        Stager_Warming_ker(x, y, z, tmp0, u, v, w, cc, fp, fm, A1, A2, A3, split_C1, split_C3);
+	}
+}
+
+void Stager_Warming_out(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, int flag, cudaStream_t *stream){
+	dim3 blockdim , griddim, size;
+
+	jobsize(&job_in, &size);
+
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x+2*LAP, size.y+2*LAP, size.z+2*LAP);
+    cudaJobPackage job( dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP), 
+	                    dim3(job_in.end.x + LAP, job_in.end.y + LAP, job_in.end.z + LAP) );
+    if(flag == 1){
+        sw_split_out sw = {*pd_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAkx_d, *pAky_d, *pAkz_d};
+    }else if(flag == 2){
+        sw_split_out sw = {*pd_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAix_d, *pAiy_d, *pAiz_d};
+    }else if(flag == 3){
+        sw_split_out sw = {*pd_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAsx_d, *pAsy_d, *pAsz_d};
+    }
+    	
+	CUDA_LAUNCH(( split_Jac3d_Stager_Warming_ker_out<<<griddim , blockdim, 0, *stream>>>(sw, *fp, *fm, tmp0, split_C1, split_C3, job) ));
+}*/
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_time.cu
+++ b/src/OCFD_time.cu
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_time.h"
+
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "parameters_d.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// Sub-step 1 of the time integrator: f = fn + dt_d * du for components 0..4.
+// One thread per cell of the job box; indices are used with get_SoA (no LAP offset).
+__global__ void OCFD_time_advance_ker1(cudaSoA f , cudaSoA fn , cudaSoA du , cudaJobPackage job)
+{
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;
+
+    // Components are updated in ascending order, as five independent statements would.
+    for(int m = 0; m < 5; m++){
+        get_SoA(f , x,y,z , m) = get_SoA(fn , x,y,z , m) + dt_d*get_SoA(du , x,y,z , m);
+    }
+}
+
+// Sub-step 2 of the time integrator: f = 3/4 * fn + 1/4 * (f + dt_d * du) for components 0..4.
+// One thread per cell of the job box; indices are used with get_SoA (no LAP offset).
+__global__ void OCFD_time_advance_ker2(cudaSoA f , cudaSoA fn , cudaSoA du , cudaJobPackage job)
+{
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;
+
+    const REAL w_old = 3.0 / 4.0;   // weight of the solution saved in fn
+    const REAL w_new = 1.0 / 4.0;   // weight of the freshly advanced stage
+
+    for(int m = 0; m < 5; m++){
+        get_SoA(f , x,y,z , m) = w_old*get_SoA(fn , x,y,z , m) + w_new*( get_SoA(f , x,y,z , m) + dt_d*get_SoA(du , x,y,z , m) );
+    }
+}
+
+// Sub-step 3 of the time integrator: f = 1/3 * fn + 2/3 * (f + dt_d * du) for components 0..4.
+// The result is also mirrored into f_lap at the LAP-shifted position (x+LAP, y+LAP, z+LAP).
+__global__ void OCFD_time_advance_ker3(cudaSoA f , cudaSoA fn , cudaSoA du , cudaSoA f_lap , cudaJobPackage job)
+{
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;
+
+    const REAL w_old = 1.0 / 3.0;   // weight of the solution saved in fn
+    const REAL w_new = 2.0 / 3.0;   // weight of the freshly advanced stage
+
+    for(int m = 0; m < 5; m++){
+        const REAL advanced = w_old*get_SoA(fn , x,y,z , m) + w_new*( get_SoA(f , x,y,z , m) + dt_d*get_SoA(du , x,y,z , m) );
+        get_SoA(f , x,y,z , m) = advanced;
+        get_SoA_LAP(f_lap , x+LAP,y+LAP,z+LAP , m) = advanced;
+    }
+}
+
+// Launches the time-advance kernel for sub-step KRK (1, 2 or 3) over the nx x ny x nz
+// interior on the default stream. Any other KRK value launches nothing.
+void OCFD_time_advance(int KRK)
+{
+    dim3 grid , block;
+    cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx,ny,nz);
+    cudaJobPackage interior( dim3(0,0,0) , dim3(nx,ny,nz) );
+
+    if(KRK == 1){
+        CUDA_LAUNCH(( OCFD_time_advance_ker1<<<grid , block>>>(*pf_d , *pfn_d , *pdu_d , interior) ));
+    }else if(KRK == 2){
+        CUDA_LAUNCH(( OCFD_time_advance_ker2<<<grid , block>>>(*pf_d , *pfn_d , *pdu_d , interior) ));
+    }else if(KRK == 3){
+        // The last sub-step also refreshes the halo-padded copy pf_lap_d.
+        CUDA_LAUNCH(( OCFD_time_advance_ker3<<<grid , block>>>(*pf_d , *pfn_d , *pdu_d , *pf_lap_d , interior) ));
+    }
+}
+
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
--- a/src/commen_kernel.cu
+++ b/src/commen_kernel.cu
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "parameters.h"
+#include "parameters_d.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+// Fills a pitched size_x x size_y x size_z array with `value`, one thread per element.
+// `pitch` is the row stride in elements; rows are laid out as (y + z * size_y).
+__global__ void cuda_mem_value_init(REAL value, REAL *ptr, unsigned int pitch, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+
+    // Threads past the array bounds do nothing.
+    if (x >= size_x || y >= size_y || z >= size_z)
+        return;
+
+    *(ptr + x + pitch * (y + z * size_y)) = value;
+}
+// Host wrapper: fills a pitched size_x x size_y x size_z device array with `value` by
+// launching cuda_mem_value_init on the default stream.
+//   ptr   : device pointer to the array.
+//   pitch : row stride in elements (not bytes), as consumed by the kernel.
+// The launch goes through CUDA_LAUNCH like every other launch in this file, so a bad launch
+// configuration is reported instead of being silently dropped.
+void cuda_mem_value_init_warp(REAL value, REAL *ptr, unsigned int pitch, unsigned int size_x, unsigned int size_y, unsigned int size_z){
+    dim3 griddim ;
+    dim3 blockdim ;
+
+    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , size_x , size_y , size_z);
+    CUDA_LAUNCH(( cuda_mem_value_init<<<griddim , blockdim>>>(value, ptr , pitch , size_x , size_y , size_z) ))
+}
+
+
+/* ========================= */
+// Primitive -> conservative conversion, one thread per interior cell.
+// Reads d, u, v, w, T from the halo-padded fields at (x+LAP, y+LAP, z+LAP) and writes
+//   pcons[0] = d, pcons[1..3] = d*(u, v, w), pcons[4] = d*((u^2+v^2+w^2)/2 + Cv_d*T)
+// at the un-padded index (x, y, z).
+__global__ void pri_to_cons_kernel(cudaSoA pcons , cudaField pd , cudaField pu , cudaField pv , cudaField pw , cudaField pT ,cudaJobPackage job ){
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z + job.start.z;
+
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    // Density
+    const REAL rho = get_Field_LAP(pd , x+LAP,y+LAP,z+LAP);
+    get_SoA(pcons , x,y,z , 0) = rho;
+
+    // Momentum components
+    const REAL vel_u = get_Field_LAP(pu , x+LAP , y+LAP , z+LAP);
+    get_SoA(pcons , x,y,z , 1) = vel_u*rho;
+
+    const REAL vel_v = get_Field_LAP(pv , x+LAP , y+LAP , z+LAP);
+    get_SoA(pcons , x,y,z , 2) = vel_v*rho;
+
+    const REAL vel_w = get_Field_LAP(pw , x+LAP , y+LAP , z+LAP);
+    get_SoA(pcons , x,y,z , 3) = vel_w*rho ;
+
+    // Total energy
+    const REAL temp = get_Field_LAP(pT , x+LAP , y+LAP , z+LAP);
+    get_SoA(pcons , x,y,z , 4) = rho*( (vel_u*vel_u + vel_v*vel_v + vel_w*vel_w)*0.5 + Cv_d * temp);
+}
+// Host wrapper: launches pri_to_cons_kernel on the default stream.
+//   pcons            : output conservative variables.
+//   pd, pu, pv, pw, pT : input primitive fields (halo-padded).
+//   job_in           : index box handed unchanged to the kernel; job_in.end also sizes the grid.
+//   blockdim_in      : requested thread-block shape passed to cal_grid_block_dim.
+void pri_to_cons_kernel_warp(cudaSoA *pcons , cudaField *pd , cudaField *pu , cudaField *pv , cudaField *pw , cudaField *pT , cudaJobPackage job_in , dim3 blockdim_in ){
+    dim3 griddim , blockdim;
+    cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , job_in.end.x , job_in.end.y , job_in.end.z);
+    CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pcons , *pd , *pu , *pv , *pw , *pT , job_in) ))
+}
+
+
+/* ========================= */
+
+// Conservative -> primitive conversion, one thread per interior cell.
+// Reads components 0..4 of f at (x, y, z) and writes d, u, v, w, T, P into the halo-padded
+// fields at (x+LAP, y+LAP, z+LAP):
+//   T = (f4 - 0.5*d*(u^2+v^2+w^2)) / (d*Cv_d),   P = T*d / (Gamma_d*Ama_d^2).
+__global__ void cons_to_pri_kernel(cudaSoA f, cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField P , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;
+
+    // Density is copied through unchanged.
+    const REAL rho = get_SoA(f, x, y, z, 0);
+    get_Field_LAP(d, x+LAP, y+LAP, z+LAP) = rho;
+
+    // Velocities: momentum divided by density.
+    const REAL vel_u = get_SoA(f, x, y, z, 1)/rho;
+    get_Field_LAP(u, x+LAP, y+LAP, z+LAP) = vel_u;
+
+    const REAL vel_v = get_SoA(f, x, y, z, 2)/rho;
+    get_Field_LAP(v, x+LAP, y+LAP, z+LAP) = vel_v;
+
+    const REAL vel_w = get_SoA(f, x, y, z, 3)/rho;
+    get_Field_LAP(w, x+LAP, y+LAP, z+LAP) = vel_w;
+
+    // Temperature from total energy minus kinetic energy.
+    const REAL energy = get_SoA(f, x, y, z, 4);
+    const REAL temp = (energy - 0.5*rho*(vel_u*vel_u + vel_v*vel_v + vel_w*vel_w))/(rho*Cv_d);
+    get_Field_LAP(T, x+LAP, y+LAP, z+LAP) = temp;
+
+    // Pressure from the equation of state as written in this code base.
+    get_Field_LAP(P, x+LAP, y+LAP, z+LAP) = temp*rho/(Gamma_d*Ama_d*Ama_d);
+}
+
+// Recomputes the primitive fields (pd_d, pu_d, pv_d, pw_d, pT_d, pP_d) from the conservative
+// array pf_d over the whole nx x ny x nz interior, on the default stream.
+void get_duvwT()
+{
+    dim3 grid , block;
+    cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx,ny,nz);
+
+    cudaJobPackage interior(dim3(0,0,0) , dim3(nx,ny,nz));
+    CUDA_LAUNCH(( cons_to_pri_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pP_d , interior) ))
+}
+
+// ----- Computation of viscosity ---------------------------------------------
+
+// Amu(x,y,z) = amu_C0_d * sqrt(T^3) / (Tsb_d + T), one thread per interior cell.
+// T is read from the halo-padded field at (x+LAP, y+LAP, z+LAP); Amu is indexed without halo.
+__global__ void get_Amu_kernal(cudaField Amu , cudaField T , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;
+
+    const REAL temp = get_Field_LAP(T , x+LAP , y+LAP , z+LAP);
+    get_Field(Amu , x,y,z) = amu_C0_d * sqrt(temp * temp * temp) / (Tsb_d + temp);
+}
+// Updates the viscosity field pAmu_d from the temperature field pT_d over the whole
+// nx x ny x nz interior, on the default stream.
+void get_Amu()
+{
+    dim3 grid , block;
+    cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx,ny,nz);
+
+    cudaJobPackage interior(dim3(0,0,0) , dim3(nx,ny,nz));
+    CUDA_LAUNCH(( get_Amu_kernal<<<grid , block>>>(*pAmu_d , *pT_d , interior) ))
+}
+
+/* ======================================================== */
+
+// cc = sqrt(T) / Ama_d, one thread per cell of the job box.
+// The indices go straight into get_Field_LAP with no +LAP offset, so the job box is
+// interpreted in halo-inclusive coordinates.
+// NOTE(review): callers presumably offset job.start/end by LAP themselves -- confirm.
+__global__ void sound_speed_kernel(cudaField T , cudaField cc , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    get_Field_LAP(cc , x,y,z) = sqrt( get_Field_LAP(T , x,y,z) )/Ama_d;
+}
+
+/* ============================================================================== */
+
+// yF += AJac * xF
+// One thread per interior cell. AJac is read from the halo-padded field at
+// (x+LAP, y+LAP, z+LAP); the accumulation into yF is done with atomicAdd at the
+// un-padded offset x + pitch*(y + ny_d*z).
+__global__ void YF_Pe_XF(cudaField yF , cudaField xF , cudaField AJac , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    const REAL jac = get_Field_LAP(AJac, x+LAP, y+LAP, z+LAP);
+    atomicAdd(yF.ptr + (x + yF.pitch*(y + ny_d*z)), jac * get_Field(xF, x, y, z));
+}
+
+// out = xF + yF
+// One thread per cell; all three fields are indexed without halo (get_Field).
+__global__ void ZF_e_XF_P_YF(cudaField out , cudaField xF , cudaField yF , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    get_Field(out , x,y,z) = get_Field(xF , x,y,z) + get_Field(yF , x,y,z) ;
+}
+// out = xF + yF, halo-padded variant.
+// One thread per cell; all three fields are indexed through get_Field_LAP with the job
+// indices used as given (no extra +LAP offset).
+__global__ void ZF_e_XF_P_YF_LAP(cudaField out , cudaField xF , cudaField yF , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    get_Field_LAP(out , x,y,z) = get_Field_LAP(xF , x,y,z) + get_Field_LAP(yF , x,y,z) ;
+}
+
+// zF += -AJac * (xF + yF)
+// One thread per interior cell. AJac is read from the halo-padded field at
+// (x+LAP, y+LAP, z+LAP); zF, xF and yF are indexed without halo.
+__global__ void ZF_Pe_XF_P_YF(cudaField zF , cudaField xF , cudaField yF , cudaField AJac , cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+    const REAL jac = get_Field_LAP(AJac , x+LAP , y+LAP , z+LAP);
+    get_Field(zF , x,y,z) += - jac * ( get_Field(xF , x,y,z) + get_Field(yF , x,y,z) ) ;
+}
+
+
+//__device__ void ZF_Pe_XF_P_YF_Device(cudaField zF, cudaField xF, cudaField yF, cudaField AJac){
+//    // WITHOUT LAPs
+//	REAL ajac;
+//	ajac = get_Field_LAP(AJac , x+LAP , y+LAP , z+LAP);
+//        get_Field(zF , x,y,z) += - ajac * ( get_Field(xF , x,y,z) + get_Field(yF , x,y,z) ) ;
+//}
+#ifdef __cplusplus
+}
+#endif
--- a/src/cuda_commen.c
+++ b/src/cuda_commen.c
+#include "cuda_commen.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/* Default thread-block extents per axis; the launch wrappers pass these to
+ * cal_grid_block_dim. 8 x 8 x 8 requests 512 threads per block. */
+uint32_t BlockDimX = 8;
+uint32_t BlockDimY = 8;
+uint32_t BlockDimZ = 8;
+
+/* dim3 block shapes of 8 x 8 x 4 (256 threads).
+ * NOTE(review): presumably one per sweep direction plus a generic default -- their users
+ * are not visible here; confirm. */
+dim3 BlockDim_X = {8, 8, 4};
+dim3 BlockDim_Y = {8, 8, 4};
+dim3 BlockDim_Z = {8, 8, 4};
+
+dim3 BlockDim = {8, 8, 4};
+
+/* Device limits, filled in by cuda_commen_init(). */
+int MaxThreadsPerBlock;
+int MaxBlockDimX;
+int MaxBlockDimY;
+int MaxBlockDimZ;
+int MaxGridDimX;
+int MaxGridDimY;
+int MaxGridDimZ;
+
+
+/* Queries the launch limits (threads per block, block dimensions, grid dimensions) and
+ * stores them in the Max* globals above.
+ * NOTE(review): every query uses device ordinal 0, not the current device; on a node where
+ * a rank selects a different GPU these limits may describe the wrong device -- confirm. */
+void cuda_commen_init(){
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxThreadsPerBlock , cudaDevAttrMaxThreadsPerBlock , 0));
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxBlockDimX , cudaDevAttrMaxBlockDimX , 0));
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxBlockDimY , cudaDevAttrMaxBlockDimY , 0));
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxBlockDimZ , cudaDevAttrMaxBlockDimZ , 0));
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxGridDimX , cudaDevAttrMaxGridDimX , 0));
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxGridDimY , cudaDevAttrMaxGridDimY , 0));
+    CUDA_CALL(cudaDeviceGetAttribute(&MaxGridDimZ , cudaDevAttrMaxGridDimZ , 0));
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/cuda_utility.c
+++ b/src/cuda_utility.c
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_runtime.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "parameters.h"
+#include "utility.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/* Backend-neutral aliases for the pitched-memory types and helpers, so the code below is
+ * written once for both HIP and CUDA builds. */
+#if defined(__HIPCC__)
+/* HIP build: map to the hip* equivalents; 3-D copies go through memcpy3D_me. */
+typedef struct hipExtent Extent_t;
+typedef struct hipPos Pos_t;
+typedef struct hipPitchedPtr PitchedPtr_t;
+typedef struct hipMemcpy3DParms Memcpy3DParms_t;
+#define make_Extent(a,b,c) make_hipExtent(a,b,c)
+#define make_Pos(a,b,c) make_hipPos(a,b,c)
+#define make_PitchedPtr(a,b,c,d) make_hipPitchedPtr(a,b,c,d)
+#define Memcpy3D(parm) memcpy3D_me(parm)
+
+#else
+/* CUDA build: map directly to the CUDA runtime types and cudaMemcpy3D. */
+typedef struct cudaExtent Extent_t;
+typedef struct cudaPos Pos_t;
+typedef struct cudaPitchedPtr PitchedPtr_t;
+typedef struct cudaMemcpy3DParms Memcpy3DParms_t;
+#define make_Extent(a,b,c) make_cudaExtent(a,b,c)
+#define make_Pos(a,b,c) make_cudaPos(a,b,c)
+#define make_PitchedPtr(a,b,c,d) make_cudaPitchedPtr(a,b,c,d)
+#define Memcpy3D(parm) cudaMemcpy3D(parm)
+#endif
+
+/* Allocates a pitched device array of size_x x size_y x size_z REAL elements.
+ *   pitch : out, row pitch in BYTES as reported by the allocator.
+ * Returns the device base pointer. HIP builds use hipMallocPitch with size_y*size_z rows;
+ * CUDA builds use cudaMalloc3D. */
+void *malloc_me_d(unsigned int *pitch, unsigned int size_x, unsigned size_y, unsigned size_z)
+{
+    PitchedPtr_t pitched;
+    Extent_t ext = make_Extent(size_x * sizeof(REAL), size_y, size_z);
+
+    #if defined(__HIPCC__)
+    CUDA_CALL( hipMallocPitch(&(pitched.ptr), &(pitched.pitch) , ext.width , ext.height*ext.depth) );
+    #else
+    CUDA_CALL( cudaMalloc3D(&pitched, ext) );
+    #endif
+
+    *pitch = pitched.pitch;
+    return pitched.ptr;
+}
+
+/*
+ * Create a cudaField: a heap-allocated host descriptor whose ptr refers to a
+ * freshly allocated pitched device block of size_x * size_y * size_z REALs.
+ *
+ * pField : [out] receives the new descriptor.
+ * The stored pitch is expressed in REAL elements (byte pitch / sizeof(REAL)).
+ */
+void new_cudaField(cudaField **pField, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    unsigned int pitch_bytes;
+    REAL *dev_data = (REAL *)malloc_me_d(&pitch_bytes, size_x , size_y, size_z);
+
+    cudaField *field = (cudaField *)malloc(sizeof(cudaField));
+    field->ptr = dev_data;
+    // convert the byte pitch returned by the allocator into an element count
+    field->pitch = pitch_bytes / sizeof(REAL);
+
+    *pField = field;
+}
+
+/*
+ * Allocate pitched device memory for a size_x * size_y * size_z block of int.
+ *
+ * pitch  : [out] row pitch of the allocation, in BYTES (narrowed to unsigned int).
+ * size_x : row length in elements; size_y / size_z : number of rows / slices.
+ * Returns the device base pointer. Allocation failures are handled by CUDA_CALL.
+ *
+ * Under HIP the block is allocated as a 2D pitched array of size_y*size_z rows.
+ */
+void *malloc_me_int_d(unsigned int *pitch, unsigned int size_x, unsigned size_y, unsigned size_z)
+{
+    PitchedPtr_t devmem;
+    Extent_t dims = make_Extent(size_x * sizeof(int), size_y, size_z);
+
+#if defined(__HIPCC__)
+    CUDA_CALL( hipMallocPitch(&(devmem.ptr), &(devmem.pitch) , dims.width , dims.height*dims.depth) );
+#else
+    CUDA_CALL( cudaMalloc3D(&devmem, dims) );
+#endif
+
+    *pitch = devmem.pitch;
+    return devmem.ptr;
+}
+
+/*
+ * Create a cudaField_int: a heap-allocated host descriptor whose ptr refers to
+ * a freshly allocated pitched device block of size_x * size_y * size_z ints.
+ *
+ * pField : [out] receives the new descriptor.
+ * The stored pitch is expressed in int elements (byte pitch / sizeof(int)).
+ */
+void new_cudaField_int(cudaField_int **pField, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    unsigned int pitch_bytes;
+    int *dev_data = (int *)malloc_me_int_d(&pitch_bytes, size_x , size_y, size_z);
+
+    cudaField_int *field = (cudaField_int *)malloc(sizeof(cudaField_int));
+    field->ptr = dev_data;
+    // convert the byte pitch returned by the allocator into an element count
+    field->pitch = pitch_bytes / sizeof(int);
+
+    *pField = field;
+}
+
+/*
+ * Destroy a cudaField: frees the device buffer (pField->ptr) and then the
+ * host-side descriptor itself. Counterpart of new_cudaField().
+ *
+ * pField : descriptor to destroy; a null pointer is accepted and ignored.
+ *
+ * The pointer is passed by value, so the caller's copy cannot be reset from
+ * here; callers must not use it after this call.
+ */
+void delete_cudaField(cudaField *pField)
+{
+    if (!pField) return;
+
+    CUDA_CALL( cudaFree(pField->ptr) );
+    free(pField);
+}
+
+/*
+ * Destroy a cudaField_int: frees the device buffer (pField->ptr) and then the
+ * host-side descriptor itself. Counterpart of new_cudaField_int().
+ *
+ * pField : descriptor to destroy; a null pointer is accepted and ignored.
+ *
+ * The pointer is passed by value, so the caller's copy cannot be reset from
+ * here; callers must not use it after this call.
+ */
+void delete_cudaField_int(cudaField_int *pField)
+{
+    if (!pField) return;
+
+    CUDA_CALL( cudaFree(pField->ptr) );
+    free(pField);
+}
+
+/*
+ * Create a cudaSoA: one pitched device allocation holding NVARS fields of
+ * size_x * size_y * size_z REALs each (allocated as size_z * NVARS slices),
+ * described by a heap-allocated host struct.
+ *
+ * pSoA : [out] receives the new descriptor.
+ * The stored pitch is in REAL elements; length_Y / length_Z record size_y / size_z.
+ */
+void new_cudaSoA(cudaSoA **pSoA, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    unsigned int pitch_bytes;
+    REAL *dev_data = (REAL *)malloc_me_d(&pitch_bytes, size_x, size_y, size_z * NVARS);
+
+    cudaSoA *soa = (cudaSoA *)malloc(sizeof(cudaSoA));
+    soa->ptr = dev_data;
+    // convert the byte pitch returned by the allocator into an element count
+    soa->pitch = pitch_bytes / sizeof(REAL);
+    soa->length_Y = size_y;
+    soa->length_Z = size_z;
+
+    *pSoA = soa;
+}
+
+
+/*
+ * Destroy a cudaSoA: frees the device buffer (pSoA->ptr) and then the
+ * host-side descriptor itself. Counterpart of new_cudaSoA().
+ *
+ * pSoA : descriptor to destroy; a null pointer is accepted and ignored.
+ *
+ * The pointer is passed by value, so the caller's copy cannot be reset from
+ * here; callers must not use it after this call.
+ */
+void delete_cudaSoA(cudaSoA *pSoA)
+{
+    if (!pSoA) return;
+
+    CUDA_CALL( cudaFree(pSoA->ptr) );
+    free(pSoA);
+}
+
+/*
+ * Create a cudaFieldPack: a heap-allocated host descriptor whose ptr refers to
+ * a contiguous (non-pitched) device buffer of size_x * size_y * size_z REALs.
+ *
+ * pack : [out] receives the new descriptor.
+ *
+ * The byte count is computed in size_t; the previous int arithmetic wrapped
+ * for buffers of 2 GiB or more and passed a bogus size to cudaMalloc.
+ */
+void new_cudaFieldPack(cudaFieldPack ** pack , unsigned int size_x , unsigned int size_y , unsigned int size_z){
+    size_t bytes = (size_t)size_x * size_y * size_z * sizeof(REAL);
+    *pack = (cudaFieldPack*)malloc(sizeof(cudaFieldPack));
+
+    REAL * ptr;
+    CUDA_CALL(( cudaMalloc((void**)&ptr , bytes) ))
+    (*pack) -> ptr = ptr;
+}
+
+/*
+ * Destroy a cudaFieldPack: frees the device buffer (pack->ptr) and then the
+ * host-side descriptor itself. Counterpart of new_cudaFieldPack().
+ *
+ * pack : descriptor to destroy; a null pointer is accepted and ignored.
+ */
+void delete_cudaFieldPack(cudaFieldPack * pack){
+    if (!pack) return;
+
+    CUDA_CALL(( cudaFree(pack->ptr) ))
+    free(pack);
+}
+
+/*
+
+{
+	struct cudaExtent extent = make_cudaExtent(nx*sizeof(REAL),ny,nz);
+    struct cudaPitchedPtr from = make_cudaPitchedPtr(SoA.ptr0 , SoA.pitch , nx , ny);
+    struct cudaPitchedPtr to = make_cudaPitchedPtr(buffer , nx*sizeof(REAL) , nx , ny);
+
+    struct cudaMemcpy3DParms parm = {0};
+    parm.extent = extent;
+    parm.srcPtr = from;
+    parm.dstPtr = to;
+    parm.kind = cudaMemcpyDeviceToHost;
+	cudaMemcpy3D(&parm);
+}
+
+*/
+
+// ================================================================== //
+/*
+ * Fill a launch configuration covering a size_x * size_y * size_z index space.
+ *
+ * pblock : [out] block dimensions, set to (threadx, thready, threadz).
+ * pgrid  : [out] number of blocks per direction, i.e. ceil(size / thread).
+ *
+ * The thread counts are used as divisors and must be non-zero.
+ */
+void cal_grid_block_dim(dim3 *pgrid, dim3 *pblock, unsigned int threadx, unsigned int thready, unsigned int threadz, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    // x direction: block extent, then number of blocks (rounded up)
+    pblock->x = threadx;
+    pgrid->x = (size_x + threadx - 1) / threadx;
+
+    // y direction
+    pblock->y = thready;
+    pgrid->y = (size_y + thready - 1) / thready;
+
+    // z direction
+    pblock->z = threadz;
+    pgrid->z = (size_z + threadz - 1) / threadz;
+}
+
+
+#include "assert.h"
+#include "mpi.h"
+#include "stdio.h"
+/*
+ * Row-by-row replacement for a 3D pitched copy: copies extent.depth slices of
+ * extent.height rows of extent.width bytes from srcPtr(+srcPos) to
+ * dstPtr(+dstPos) with one cudaMemcpy per row. Used as Memcpy3D() under HIP.
+ *
+ * p : copy description; pos.x and extent.width are in bytes, pos.y/z and
+ *     extent.height/depth in rows/slices, slices are ysize rows apart.
+ *
+ * Returns cudaSuccess. Any failing row copy prints a diagnostic and aborts
+ * the whole MPI job, so an error code is never returned to the caller.
+ *
+ * The x/y ranges are asserted against the pitched-pointer sizes; z cannot be
+ * checked because the pitched pointer carries no slice count. The position
+ * fields are unsigned (size_t), so no lower-bound checks are needed.
+ */
+cudaError_t memcpy3D_me(Memcpy3DParms_t * p){
+    assert( p->srcPos.x + p->extent.width  - 1 < p->srcPtr.xsize );
+    assert( p->srcPos.y + p->extent.height - 1 < p->srcPtr.ysize );
+
+    assert( p->dstPos.x + p->extent.width  - 1 < p->dstPtr.xsize );
+    assert( p->dstPos.y + p->extent.height - 1 < p->dstPtr.ysize );
+
+    const size_t src_pitch = p->srcPtr.pitch;
+    const size_t dst_pitch = p->dstPtr.pitch;
+
+    // byte addresses of the first element to copy on each side
+    char * src = (char*)p->srcPtr.ptr + p->srcPos.x + src_pitch*(p->srcPos.y + p->srcPtr.ysize*p->srcPos.z);
+    char * dst = (char*)p->dstPtr.ptr + p->dstPos.x + dst_pitch*(p->dstPos.y + p->dstPtr.ysize*p->dstPos.z);
+
+    // size_t counters: row offsets are no longer truncated to int
+    for(size_t k=0;k<p->extent.depth;k++){
+        const size_t src_row0 = k*p->srcPtr.ysize;
+        const size_t dst_row0 = k*p->dstPtr.ysize;
+        for(size_t j=0;j<p->extent.height;j++){
+            cudaError_t error = cudaMemcpy(dst + dst_pitch*(dst_row0+j) , src + src_pitch*(src_row0+j) , p->extent.width , p->kind);
+            if(error != cudaSuccess){
+                printf("error in memcpy3d_me , ( k = %d , j = %d)\n",(int)k,(int)j);
+                printf("In file \"%s\" , line %d ( \"%s\" ) , cuda call failed\n",__FILE__ , __LINE__ , __FUNCTION__);
+                printf("Error code : %s  (%s)\n",cudaGetErrorString(error) , cudaGetErrorName(error));
+                MPI_Abort(MPI_COMM_WORLD , 1);
+            }
+        }
+    }
+    return cudaSuccess;
+}
+
+/*
+ * Copy a whole size_x * size_y * size_z int block between a densely packed
+ * host array and a pitched device array.
+ *
+ * pitch : device row pitch in int elements.
+ * mode  : H2D copies host -> device; any other value copies device -> host.
+ */
+void memcpy_All_int(int *hostPtr, int *devPtr, unsigned int pitch, int mode, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const size_t row_bytes = size_x * sizeof(int);
+    const int to_device = (mode == H2D);
+
+    PitchedPtr_t host_view = make_PitchedPtr(hostPtr, row_bytes, row_bytes, size_y);
+    PitchedPtr_t dev_view = make_PitchedPtr(devPtr, pitch*sizeof(int), row_bytes, size_y);
+
+    Memcpy3DParms_t parm = {0};
+    parm.extent = make_Extent(row_bytes, size_y, size_z);
+    parm.srcPtr = to_device ? host_view : dev_view;
+    parm.dstPtr = to_device ? dev_view : host_view;
+    parm.kind = to_device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+
+    CUDA_CALL( Memcpy3D(&parm) );
+}
+
+/*
+ * Copy a whole size_x * size_y * size_z REAL block between a densely packed
+ * host array and a pitched device array.
+ *
+ * pitch : device row pitch in REAL elements.
+ * mode  : H2D copies host -> device; any other value copies device -> host.
+ */
+void memcpy_All(REAL *hostPtr, REAL *devPtr, unsigned int pitch, int mode, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const size_t row_bytes = size_x * sizeof(REAL);
+    const int to_device = (mode == H2D);
+
+    PitchedPtr_t host_view = make_PitchedPtr(hostPtr, row_bytes, row_bytes, size_y);
+    PitchedPtr_t dev_view = make_PitchedPtr(devPtr, pitch*sizeof(REAL), row_bytes, size_y);
+
+    Memcpy3DParms_t parm = {0};
+    parm.extent = make_Extent(row_bytes, size_y, size_z);
+    parm.srcPtr = to_device ? host_view : dev_view;
+    parm.dstPtr = to_device ? dev_view : host_view;
+    parm.kind = to_device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+
+    CUDA_CALL( Memcpy3D(&parm) );
+}
+
+/*
+ * Copy only the interior of a size_x * size_y * size_z REAL block, i.e. the
+ * region that excludes a LAP-wide layer on every face, between a densely
+ * packed host array and a pitched device array. The same offset
+ * (LAP, LAP, LAP) is used on both sides.
+ *
+ * pitch : device row pitch in REAL elements.
+ * mode  : H2D copies host -> device; any other value copies device -> host.
+ */
+void memcpy_inner(REAL *hostPtr, REAL *devPtr, unsigned int pitch, int mode, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const size_t row_bytes = size_x * sizeof(REAL);
+    const int to_device = (mode == H2D);
+
+    PitchedPtr_t host_view = make_PitchedPtr(hostPtr, row_bytes, row_bytes, size_y);
+    PitchedPtr_t dev_view = make_PitchedPtr(devPtr, pitch*sizeof(REAL), row_bytes, size_y);
+
+    // first interior element; the x component is a byte offset
+    Pos_t first_inner = make_Pos(sizeof(REAL)*LAP , LAP , LAP);
+
+    Memcpy3DParms_t parm = {0};
+    parm.extent = make_Extent( (size_x-2*LAP )* sizeof(REAL), size_y-2*LAP, size_z-2*LAP);
+    parm.srcPos = first_inner;
+    parm.dstPos = first_inner;
+    parm.srcPtr = to_device ? host_view : dev_view;
+    parm.dstPtr = to_device ? dev_view : host_view;
+    parm.kind = to_device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+
+    CUDA_CALL( Memcpy3D(&parm) );
+}
+
+/*
+ * Copy the two LAP-wide x-direction strips of a size_x * size_y * size_z REAL
+ * block between a densely packed host array and a pitched device array.
+ * Both strips span y in [LAP, size_y-LAP) and z in [LAP, size_z-LAP), and use
+ * the same position on the source and destination side.
+ *
+ *   mode == H2D : strips at x in [0, LAP) and [size_x-LAP, size_x)
+ *   otherwise   : strips at x in [LAP, 2*LAP) and [size_x-2*LAP, size_x-LAP),
+ *                 copied device -> host
+ *
+ * pitch : device row pitch in REAL elements.
+ */
+void memcpy_bound_x(REAL *hostPtr, REAL *devPtr, unsigned int pitch, int mode, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const size_t row_bytes = size_x * sizeof(REAL);
+    const int to_device = (mode == H2D);
+
+    PitchedPtr_t host_view = make_PitchedPtr(hostPtr, row_bytes, row_bytes, size_y);
+    PitchedPtr_t dev_view = make_PitchedPtr(devPtr, pitch*sizeof(REAL), row_bytes, size_y);
+
+    // byte offsets of the low ("minus") and high ("plus") strips along x
+    const size_t x_lo = to_device ? 0 : sizeof(REAL)*LAP;
+    const size_t x_hi = to_device ? sizeof(REAL)*(size_x-  LAP) : sizeof(REAL)*(size_x-2*LAP);
+
+    // everything except the position is shared by the two copies
+    Memcpy3DParms_t parm_m = {0};
+    parm_m.extent = make_Extent( LAP * sizeof(REAL), size_y-2*LAP, size_z-2*LAP);
+    parm_m.srcPtr = to_device ? host_view : dev_view;
+    parm_m.dstPtr = to_device ? dev_view : host_view;
+    parm_m.kind = to_device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+
+    Memcpy3DParms_t parm_p = parm_m;
+
+    parm_m.srcPos = make_Pos(x_lo , LAP , LAP);
+    parm_m.dstPos = parm_m.srcPos;
+
+    parm_p.srcPos = make_Pos(x_hi , LAP , LAP);
+    parm_p.dstPos = parm_p.srcPos;
+
+    CUDA_CALL( Memcpy3D(&parm_m) );
+    CUDA_CALL( Memcpy3D(&parm_p) );
+}
+
+/*
+ * Copy the two LAP-thick y-direction slabs of a size_x * size_y * size_z REAL
+ * block between a densely packed host array and a pitched device array.
+ * Both slabs span x in [LAP, size_x-LAP) and z in [LAP, size_z-LAP), and use
+ * the same position on the source and destination side.
+ *
+ *   mode == H2D : slabs at y in [0, LAP) and [size_y-LAP, size_y)
+ *   otherwise   : slabs at y in [LAP, 2*LAP) and [size_y-2*LAP, size_y-LAP),
+ *                 copied device -> host
+ *
+ * pitch : device row pitch in REAL elements.
+ */
+void memcpy_bound_y(REAL *hostPtr, REAL *devPtr, unsigned int pitch, int mode, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const size_t row_bytes = size_x * sizeof(REAL);
+    const int to_device = (mode == H2D);
+
+    PitchedPtr_t host_view = make_PitchedPtr(hostPtr, row_bytes, row_bytes, size_y);
+    PitchedPtr_t dev_view = make_PitchedPtr(devPtr, pitch*sizeof(REAL), row_bytes, size_y);
+
+    // row indices of the low ("minus") and high ("plus") slabs along y
+    const size_t y_lo = to_device ? 0 : LAP;
+    const size_t y_hi = to_device ? size_y-LAP : size_y-2*LAP;
+
+    // everything except the position is shared by the two copies
+    Memcpy3DParms_t parm_m = {0};
+    parm_m.extent = make_Extent(  (size_x - 2*LAP) * sizeof(REAL), LAP, size_z-2*LAP);
+    parm_m.srcPtr = to_device ? host_view : dev_view;
+    parm_m.dstPtr = to_device ? dev_view : host_view;
+    parm_m.kind = to_device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+
+    Memcpy3DParms_t parm_p = parm_m;
+
+    parm_m.srcPos = make_Pos(sizeof(REAL)*LAP , y_lo , LAP);
+    parm_m.dstPos = parm_m.srcPos;
+
+    parm_p.srcPos = make_Pos(sizeof(REAL)*LAP , y_hi , LAP);
+    parm_p.dstPos = parm_p.srcPos;
+
+    CUDA_CALL( Memcpy3D(&parm_m) );
+    CUDA_CALL( Memcpy3D(&parm_p) );
+}
+
+/*
+ * Copy the two LAP-thick z-direction slabs of a size_x * size_y * size_z REAL
+ * block between a densely packed host array and a pitched device array.
+ * Both slabs span x in [LAP, size_x-LAP) and y in [LAP, size_y-LAP), and use
+ * the same position on the source and destination side.
+ *
+ *   mode == H2D : slabs at z in [0, LAP) and [size_z-LAP, size_z)
+ *   otherwise   : slabs at z in [LAP, 2*LAP) and [size_z-2*LAP, size_z-LAP),
+ *                 copied device -> host
+ *
+ * pitch : device row pitch in REAL elements.
+ */
+void memcpy_bound_z(REAL *hostPtr, REAL *devPtr, unsigned int pitch, int mode, unsigned int size_x, unsigned int size_y, unsigned int size_z)
+{
+    const size_t row_bytes = size_x * sizeof(REAL);
+    const int to_device = (mode == H2D);
+
+    PitchedPtr_t host_view = make_PitchedPtr(hostPtr, row_bytes, row_bytes, size_y);
+    PitchedPtr_t dev_view = make_PitchedPtr(devPtr, pitch*sizeof(REAL), row_bytes, size_y);
+
+    // slice indices of the low ("minus") and high ("plus") slabs along z
+    const size_t z_lo = to_device ? 0 : LAP;
+    const size_t z_hi = to_device ? size_z-LAP : size_z-2*LAP;
+
+    // everything except the position is shared by the two copies
+    Memcpy3DParms_t parm_m = {0};
+    parm_m.extent = make_Extent(  (size_x - 2*LAP) * sizeof(REAL), size_y - 2*LAP, LAP);
+    parm_m.srcPtr = to_device ? host_view : dev_view;
+    parm_m.dstPtr = to_device ? dev_view : host_view;
+    parm_m.kind = to_device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+
+    Memcpy3DParms_t parm_p = parm_m;
+
+    parm_m.srcPos = make_Pos(sizeof(REAL)*LAP , LAP , z_lo);
+    parm_m.dstPos = parm_m.srcPos;
+
+    parm_p.srcPos = make_Pos(sizeof(REAL)*LAP , LAP , z_hi);
+    parm_p.dstPos = parm_p.srcPos;
+
+    CUDA_CALL( Memcpy3D(&parm_m) );
+    CUDA_CALL( Memcpy3D(&parm_p) );
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/parameters.c
+++ b/src/parameters.c
+#ifndef __PARAMETER_H
+#define __PARAMETER_H
+#include "mpi.h"
+#include "pthread.h"
+#include "config_parameters.h"
+#include "OCFD_Schemes_hybrid_auto.h"
+#include "OCFD_Schemes_Choose.h"
+#include "utility.h"
+#include "stdio.h"
+#include "string.h"
+#include "assert.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// ------ Double precision (real*8) ---------------------------------------------------------------------
+// OCFD_DATA_TYPE is the MPI datatype used when broadcasting REAL data.
+int OCFD_REAL_KIND=8;
+MPI_Datatype OCFD_DATA_TYPE = MPI_DOUBLE;   // double-precision computing
+// ------ Single precision (real*4), disabled -----------------------
+// NOTE(review): read_parameters() scans REAL values with "%lf", so enabling this
+// also requires changing those formats; MPI_REAL is the Fortran REAL type,
+// the C float type is MPI_FLOAT -- confirm before enabling.
+// typedef float REAL;
+// int OCFD_REAL_KIND=4;
+// MPI_Datatype OCFD_DATA_TYPE = MPI_REAL;             // single-precision computing
+// ===========  Parameters for MPI ==========================================================----------------
+
+
+
+// ----- constants -----
+// Re, Ama, Gamma, Pr, Ref_T and epsl_SW are read by read_parameters() from the
+// config keys RE, AMA, GAMMA, PR, T_REF and EPSL_SW (Ama is printed as "Ma";
+// epsl_SW is the epsilon of the Steger-Warming splitting).
+REAL Re , Pr , Ama , Gamma , Ref_T , epsl_SW , PI = 3.141592653589793;
+REAL Cv , Cp , Tsb , amu_C0;
+REAL split_C1 , split_C3, tmp0;
+REAL vis_flux_init_c;
+
+
+// -------- MPI -------------------
+int my_id,npx,npy,npz;  // global rank id and per-direction process ids
+int NPX0=0 , NPY0=0 , NPZ0=0; // number of processes in each direction (config key PARALLEL_3D)
+int ID_XP1,ID_XM1,ID_YP1,ID_YM1,ID_ZP1,ID_ZM1; // global ids of the neighbouring processes
+
+MPI_Status status;
+MPI_Comm MPI_COMM_X,MPI_COMM_Y,MPI_COMM_Z,MPI_COMM_XY,MPI_COMM_XZ,MPI_COMM_YZ;
+MPI_Datatype TYPE_LAPX1,TYPE_LAPY1,TYPE_LAPZ1,TYPE_LAPX2,TYPE_LAPY2,TYPE_LAPZ2;
+
+int *i_offset,*j_offset,*k_offset,*i_nn,*j_nn,*k_nn; // block-partition information along each direction
+int MSG_BLOCK_SIZE; // config key MSG_BLOCK_SIZE
+
+unsigned int nx=0,ny=0,nz=0; // number of points handled by this process in each direction
+unsigned int NX_GLOBAL=0,NY_GLOBAL=0,NZ_GLOBAL=0; // global mesh size (config key GRID_3D)
+unsigned int nx_lap,ny_lap,nz_lap;     // presumably n + LAP -- TODO confirm where these are set
+unsigned int nx_2lap,ny_2lap,nz_2lap;  // presumably n + 2*LAP -- TODO confirm where these are set
+
+int Stream_MODE; // stream mode (config key STREAM)
+int TEST;        // config key TEST
+pthread_t* thread_handles;
+// --------------------------------------------------------------------------------------------------------
+// Time-stepping state: dt and end_time come from the config keys DT and END_TIME.
+REAL dt,end_time,tt;
+REAL cpu_time;
+int Istep , end_step;
+
+// ----------- Analysis and Save ------------------------------------------
+
+int OCFD_ANA_time_average; 
+int OCFD_ana_flatplate,  OCFD_ana_saveplaneYZ;
+int OCFD_ana_saveplaneXZ, OCFD_ana_savePoints;
+int OCFD_ana_saveplaneXY, OCFD_ana_saveblock;
+int OCFD_ana_getQ;
+
+// Kstep_save / Kstep_show: config keys KSTEP_SAVE / KSTEP_SHOW.
+// N_ana: number of ANA_EVENT entries; K_ana[i] and Kstep_ana[i] are the two
+// integers read from each ANA_EVENT<n> line.
+// NOTE(review): non-root ranks allocate these arrays for 10 events in
+// read_parameters() but receive N_ana entries -- confirm N_ana <= 10.
+int Kstep_save, Kstep_show,N_ana,*K_ana,*Kstep_ana,KRK;
+
+//------------ Scheme choice, inviscid and viscous (set via Schemes_Choose_ID) ------------
+int Scheme_invis_ID = 0;
+int Scheme_vis_ID = 0;
+
+//--------------------- WENO_SYMBO limiter ------------------------------------------
+REAL WENO_TV_Limiter = 5.0;
+REAL WENO_TV_MAX = 0.2;
+
+// ----------- Boundary Condition and Initial Condition -----------------------------
+// Read from config keys IPERIODIC (3), JAC_BOUND (3), DIF_BOUND (6), NON_REFLETION (6).
+int Iperiodic[3], Jacbound[3], D0_bound[6];
+int Non_ref[6];
+
+int Init_stat;  // config key INIT_STAT
+int IBC_USER;   // config key IBC
+
+
+REAL N_nan;
+// BC_*para: up to 100 values each from BC_NPARA / BC_RPARA;
+// ANA_*para: 100 values per analysis event from ANA_NPARA<n> / ANA_RPARA<n>.
+REAL *BC_rpara, (*ANA_rpara)[100]; 
+int *BC_npara, (*ANA_npara)[100]; 
+
+
+// ------ Filter ----------------------------------------------- 
+int FILTER_1,FILTER_2;   // 1: 5-point filter, 2: 7-point wide-band filter
+// NOTE(review): non-root ranks allocate NF_max filter rows in read_parameters()
+// but receive NFiltering+1 rows -- confirm NFiltering+1 <= NF_max.
+int NF_max = 10;   // maximum number of filters
+int Filter_Fo9p=1, Filter_Fopt_shock=2;
+// Filter_para row: ntime, Filter_X, Filter_Y, Filter_Z, ib, ie, jb, je, kb, ke, Filter_scheme
+int NFiltering,(*Filter_para)[11];      // number of FILTER_NPARA entries and their integer parameters
+REAL (*Filter_rpara)[3];   // s0  (0<s0<1), rth
+char IF_Filter_X = 0 , IF_Filter_Y  = 0, IF_Filter_Z = 0 ;
+int fiter_judge_X = 0, fiter_judge_Y = 0, fiter_judge_Z = 0;
+
+
+
+// Coordinate parameters 
+REAL hx,hy,hz;
+REAL *pAxx,*pAyy,*pAzz,*pAkx,*pAky,*pAkz,*pAix,*pAiy,*pAiz,*pAsx,*pAsy,*pAsz,*pAjac;
+
+
+// Work arrays for the computation; the bracketed shapes are the author's
+// intended layouts (these are flat pointers allocated elsewhere).
+REAL *pAmu; // viscosity, 3D [nz][nt][nx]  (NOTE(review): "nt" is presumably "ny" -- confirm)
+REAL *pd,*pu,*pv,*pw,*pT,*pP; //  [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+REAL *pf,*pfn,*pdu; // [5][nz][ny][nx]
+
+// used in filtering
+REAL *pf_lap; // [nz+2*LAP][ny+2*LAP][nx+2*LAP][5]
+
+// used in the inviscid Jacobian; part of ptmpa
+REAL *pfp; // [5][nz+2*LAP][ny+2*LAP][nx+2*LAP]
+REAL *pfm; // [5][nz+2*LAP][ny+2*LAP][nx+2*LAP]
+REAL *pcc; // [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+// used in the inviscid Jacobian; part of ptmpb
+REAL *pdfp , *pdfm; // [nz][ny][nx]
+
+// used in analysis
+REAL *pQ , *pLamda2;// [nx][ny][nx]  (NOTE(review): last index is presumably nz -- confirm)
+REAL *pdm, *pum, *pvm, *pwm, *pTm;//[nx+2*LAP][ny+2*LAP][nz+2*LAP]
+int average_IO = 1;
+int Istep_average;
+REAL tt_average;
+
+// used in the viscous Jacobian; part of ptmpb
+REAL * pEv1,*pEv2,*pEv3,*pEv4;  // [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+// used in the viscous Jacobian; part of ptmpb
+REAL *puk,*pui,*pus,*pvk,*pvi,*pvs,*pwk,*pwi,*pws,*pTk,*pTi,*pTs;  //[nz][ny][nx]
+// used in memcpy (send/receive pack buffers per direction)
+REAL *pack_send_x,* pack_recv_x;
+REAL *pack_send_y,* pack_recv_y;
+REAL *pack_send_z,* pack_recv_z;
+
+// used in boundary_liftbody *******************************************************************
+
+int OCFD_BC_Liftbody3d;
+int IF_SYMMETRY = 0;
+int IF_WITHLEADING;  // 0: without the leading (head) part, 1: with the leading part
+int IFLAG_UPPERBOUNDARY; // 0: outside the shock; 1: shock
+REAL AOA,TW,EPSL_WALL,EPSL_UPPER,WALL_DIS_BEGIN,WALL_DIS_END;
+REAL Sin_AOA , Cos_AOA;
+
+
+// used in boundary_compressible_conner ********************************************************
+int MZMAX, MTMAX, INLET_BOUNDARY, IFLAG_WALL_NOT_NORMAL;
+REAL EPSL, X_DIST_BEGIN, X_DIST_END, BETA;
+REAL X_WALL_BEGIN, X_UP_BOUNDARY_BEGIN;
+
+
+// used in SCHEME_HYBRIDAUTO ********************************************************************
+// IFLAG_HybridAuto is set to 1 by read_parameters() when SCHEME_INVIS is "SCHEME_HYBRIDAUTO".
+int IFLAG_HybridAuto = 0;
+// Sizes of the HybridAuto buffers allocated in read_parameters():
+// P_intvs has HybridA_Stage-1 entries, zones 6*Patch_max ints, Pa_zones Patch_max entries.
+int HybridA_Stage = 3, Patch_max = 10;
+int IFLAG_mem = 1;
+HybridAuto_TYPE HybridAuto;
+int *scheme_x, *scheme_y, *scheme_z;
+
+int IF_CHARTERIC; // config key CHARTERIC
+
+
+
+// Table of the keys looked up in the input file by SearchItem() in
+// read_parameters(). read_parameters() then reads configList[i].value by
+// numeric index, so the order of the entries (and the index comments on the
+// right) must stay in sync with that function.
+configItem configList[27] = {
+    {"GRID_3D", 0},                 //0
+    {"PARALLEL_3D", 0},             //1
+    {"LAP", 0},                     //2
+    {"MSG_BLOCK_SIZE", 0},          //3      
+    {"STREAM", 0},                  //4
+    {"TEST", 0},                    //5
+    {"IPERIODIC", 0},               //6
+    {"JAC_BOUND", 0},               //7
+    {"DIF_BOUND", 0},               //8
+    {"NON_REFLETION", 0},           //9
+    {"SCHEME_INVIS", 0},            //10
+    {"SCHEME_VIS", 0},              //11
+    {"RE", 0},                      //12
+    {"AMA", 0},                     //13
+    {"GAMMA", 0},                   //14
+    {"PR", 0},                      //15
+    {"T_REF", 0},                   //16
+    {"EPSL_SW", 0},                 //17
+    {"DT", 0},                      //18
+    {"END_TIME", 0},                //19
+    {"KSTEP_SHOW", 0},              //20
+    {"KSTEP_SAVE", 0},              //21
+    {"INIT_STAT", 0},               //22
+    {"IBC", 0},                     //23
+    {"BC_NPARA", 0},                //24
+    {"BC_RPARA", 0},                //25
+    {"CHARTERIC", 0}                //26   
+};
+
+
+void read_parameters(){
+//-------------------------------------------
+    int dummy_i, tmp;
+    REAL dummy_r;
+    int configNum = sizeof(configList)/sizeof(configItem);
+    char Scheme_invis[50], Scheme_vis[50];
+    char Part_buff[50][1000];
+
+    if(my_id == 0){
+        int nk,nr;
+        FILE * file = fopen("opencfd-scu.in","r");
+
+        if(file == NULL){
+            printf("\033[31mopencfd-scu.in is not find!\033[0m\n");
+            exit(-1);
+        }
+
+        SearchItem(file, configList, configNum);
+        
+        sscanf(configList[0].value,"%d%d%d",&NX_GLOBAL,&NY_GLOBAL,&NZ_GLOBAL);
+        sscanf(configList[1].value,"%d%d%d",&NPX0,&NPY0,&NPZ0);
+        sscanf(configList[2].value,"%d",&dummy_i);
+        sscanf(configList[3].value,"%d",&MSG_BLOCK_SIZE);
+        sscanf(configList[4].value,"%d",&Stream_MODE);
+        sscanf(configList[5].value,"%d",&TEST);
+
+        sscanf(configList[6].value,"%d%d%d",&Iperiodic[0],&Iperiodic[1],&Iperiodic[2]);
+        sscanf(configList[7].value,"%d%d%d",&Jacbound[0],&Jacbound[1],&Jacbound[2]);
+        sscanf(configList[8].value,"%d%d%d%d%d%d",&D0_bound[0],&D0_bound[1],&D0_bound[2],&D0_bound[3],&D0_bound[4],&D0_bound[5]);
+        sscanf(configList[9].value,"%d%d%d%d%d%d",&Non_ref[0],&Non_ref[1],&Non_ref[2],&Non_ref[3],&Non_ref[4],&Non_ref[5]);
+
+        sscanf(configList[10].value,"%s", Scheme_invis);
+        sscanf(configList[11].value,"%s", Scheme_vis);
+        SCHEME_CHOOSE scheme = {Scheme_invis, Scheme_vis};
+        Schemes_Choose_ID(&scheme);
+
+        if(strcmp(Scheme_invis, "SCHEME_HYBRIDAUTO") == 0) IFLAG_HybridAuto = 1;
+
+        HybridAuto.Num_Patch_zones = 0;
+        HybridAuto.IF_Smooth_dp = 0;
+
+        HybridAuto.P_intvs = (REAL *)malloc((HybridA_Stage - 1)*sizeof(REAL));
+        HybridAuto.zones = (int *)malloc(6*Patch_max*sizeof(int));
+        HybridAuto.Pa_zones = (REAL *)malloc(Patch_max*sizeof(REAL));
+
+        if(IFLAG_HybridAuto == 1){
+
+            int (*HybridAuto_zones)[6] = (int(*)[6])HybridAuto.zones;
+
+            configItem Hybridbuff = {"HY_DP_INTV", 0};
+            SearchItem(file, &Hybridbuff, 1);
+
+            tmp = PartItem(Hybridbuff.value, Part_buff);
+            for(int i=0;i<(HybridA_Stage-1);i++) sscanf(Part_buff[i],"%lf",&HybridAuto.P_intvs[i]);
+
+            sprintf(Hybridbuff.name, "HY_STYLE");
+            SearchItem(file, &Hybridbuff, 1);
+            sscanf(Hybridbuff.value,"%d",&HybridAuto.Style);
+
+            if(HybridAuto.Style != 1 && HybridAuto.Style != 2){
+                printf("\033[31mHYBRID SCHEMES CHOOSE IS WRONG！！！\033[0m\n");
+                exit(0);
+            }
+
+            sprintf(Hybridbuff.name, "HY_SMOOTH_DP");
+            SearchItem(file, &Hybridbuff, 1);
+            sscanf(Hybridbuff.value,"%d",&HybridAuto.IF_Smooth_dp);
+
+            sprintf(Hybridbuff.name, "HY_PATCH_ZONE");
+            SearchItem(file, &Hybridbuff, 1);
+            sscanf(Hybridbuff.value,"%d",&HybridAuto.Num_Patch_zones);
+
+            for(int i=0; i<HybridAuto.Num_Patch_zones; i++){
+                sprintf(Hybridbuff.name, "HY_ZONE%d", i);
+                SearchItem(file, &Hybridbuff, 1);
+
+                sscanf(Hybridbuff.value,"%d%d%d%d%d%d%lf",&HybridAuto_zones[i][0],&HybridAuto_zones[i][1],&HybridAuto_zones[i][2],
+                &HybridAuto_zones[i][3],&HybridAuto_zones[i][4],&HybridAuto_zones[i][5],&HybridAuto.Pa_zones[i]);
+            }
+        }
+
+        sscanf(configList[12].value,"%lf",&Re);
+        sscanf(configList[13].value,"%lf",&Ama);
+        sscanf(configList[14].value,"%lf",&Gamma);
+        sscanf(configList[15].value,"%lf",&Pr);
+        sscanf(configList[16].value,"%lf",&Ref_T);
+        sscanf(configList[17].value,"%lf",&epsl_SW);
+
+        sscanf(configList[18].value,"%lf",&dt);
+        sscanf(configList[19].value,"%lf",&end_time);
+        sscanf(configList[20].value,"%d",&Kstep_show);
+        sscanf(configList[21].value,"%d",&Kstep_save);
+        sscanf(configList[22].value,"%d",&Init_stat);
+
+        sscanf(configList[23].value,"%d",&IBC_USER);
+        BC_npara = (int*)malloc(sizeof(int)*100);
+        BC_rpara = (REAL*)malloc(sizeof(REAL)*100);
+
+        nk = PartItem(configList[24].value, Part_buff);
+        for(int i=0;i<nk;i++) sscanf(Part_buff[i],"%d",BC_npara+i);
+        
+        nr = PartItem(configList[25].value, Part_buff);
+        for(int i=0;i<nr;i++) sscanf(Part_buff[i],"%lf",BC_rpara+i);
+
+        sscanf(configList[26].value,"%d", &IF_CHARTERIC);
+
+        int NameNUM[1000];
+
+        NFiltering = ItemNUM(file, "FILTER_NPARA", &NameNUM[0]);
+
+        Filter_para = (int(*)[11])malloc(sizeof(int)*(NFiltering+1)*11);
+        Filter_rpara = (REAL(*)[3])malloc(sizeof(REAL)*(NFiltering+1)*3);
+
+        for(int i=0;i<NFiltering;i++){
+            configItem Hybridbuff;
+            //ntime, Filter_X, Filter_Y, Filter_Z, ib, ie, jb, je, kb, ke, Filter_scheme
+            sprintf(Hybridbuff.name, "FILTER_NPARA%d", NameNUM[i]);
+            SearchItem(file, &Hybridbuff, 1);
+
+            tmp = PartItem(Hybridbuff.value, Part_buff);
+            for(int n=0;n<11;n++) sscanf(Part_buff[n],"%d",&Filter_para[i][n]);
+            for(int n=0;n<11;n++) Filter_para[i+1][n] = Filter_para[i][n];
+
+            sprintf(Hybridbuff.name, "FILTER_RPARA%d", NameNUM[i]);
+            SearchItem(file, &Hybridbuff, 1);
+
+            tmp = PartItem(Hybridbuff.value, Part_buff);
+            for(int n=0;n<3;n++) sscanf(Part_buff[n],"%lf",&Filter_rpara[i][n]);
+            for(int n=0;n<3;n++) Filter_rpara[i+1][n] = Filter_rpara[i][n];
+        }
+
+        N_ana = ItemNUM(file, "ANA_EVENT", &NameNUM[0]);
+
+        ANA_npara = (int(*)[100])malloc(sizeof(int)*100*N_ana);
+        ANA_rpara = (REAL(*)[100])malloc(sizeof(REAL)*100*N_ana);
+        K_ana = (int*)malloc(sizeof(int)*N_ana);
+        Kstep_ana = (int*)malloc(sizeof(int)*N_ana);
+
+        for(int i=0;i<N_ana;i++){
+            configItem Hybridbuff;
+            sprintf(Hybridbuff.name, "ANA_EVENT%d", NameNUM[i]);
+            SearchItem(file, &Hybridbuff, 1);
+
+            sscanf(Hybridbuff.value,"%d%d",K_ana+i,Kstep_ana+i);
+
+            sprintf(Hybridbuff.name, "ANA_NPARA%d", NameNUM[i]);
+            SearchItem(file, &Hybridbuff, 1);
+
+            nk = PartItem(Hybridbuff.value, Part_buff);
+            for(int n=0;n<nk;n++) sscanf(Part_buff[n],"%d",&ANA_npara[i][n]);
+
+            sprintf(Hybridbuff.name, "ANA_RPARA%d", NameNUM[i]);
+            SearchItem(file, &Hybridbuff, 1);
+
+            nr = PartItem(Hybridbuff.value, Part_buff);
+            for(int n=0;n<nr;n++) sscanf(Part_buff[n],"%lf",&ANA_rpara[i][n]);
+        }
+
+        fclose(file);
+
+    } else {
+        int nk = 100;
+        int nr = 100;
+        
+        N_ana = 10;
+        
+        BC_npara = (int*)malloc(sizeof(int)*nk);
+        BC_rpara = (REAL*)malloc(sizeof(REAL)*nr);
+
+        Filter_para = (int(*)[11])malloc(sizeof(int)*NF_max*11);
+        Filter_rpara = (REAL(*)[3])malloc(sizeof(REAL)*NF_max*3);
+
+        ANA_npara = (int(*)[100])malloc(sizeof(int)*100*N_ana);
+        ANA_rpara = (REAL(*)[100])malloc(sizeof(REAL)*100*N_ana);
+        K_ana = (int*)malloc(sizeof(int)*N_ana);
+        Kstep_ana = (int*)malloc(sizeof(int)*N_ana);
+
+        HybridAuto.P_intvs = (REAL *)malloc((HybridA_Stage - 1)*sizeof(REAL));
+        HybridAuto.zones = (int *)malloc(6*Patch_max*sizeof(int));
+        HybridAuto.Pa_zones = (REAL *)malloc(Patch_max*sizeof(REAL));
+    }
+
+    int btmp[18];
+    if(my_id == 0){
+        btmp[0]=Jacbound[0];
+        btmp[1]=Jacbound[1];
+        btmp[2]=Jacbound[2];
+
+        btmp[3]=D0_bound[0];
+        btmp[4]=D0_bound[1];
+        btmp[5]=D0_bound[2];
+        btmp[6]=D0_bound[3];
+        btmp[7]=D0_bound[4];
+        btmp[8]=D0_bound[5];
+
+        btmp[9] =Iperiodic[0];
+        btmp[10]=Iperiodic[1];
+        btmp[11]=Iperiodic[2];
+
+        for(int i = 0; i < 6; i++){
+            btmp[i+12] = Non_ref[i];
+        }
+    }
+
+       MPI_Bcast(btmp, 18, MPI_INT, 0, MPI_COMM_WORLD);
+
+    if(my_id!=0){
+        Jacbound[0]=btmp[0];
+        Jacbound[1]=btmp[1];
+        Jacbound[2]=btmp[2];
+
+        D0_bound[0]=btmp[3];
+        D0_bound[1]=btmp[4];
+        D0_bound[2]=btmp[5];
+        D0_bound[3]=btmp[6];
+        D0_bound[4]=btmp[7];
+        D0_bound[5]=btmp[8];
+
+        Iperiodic[0]=btmp[9];
+        Iperiodic[1]=btmp[10];
+        Iperiodic[2]=btmp[11];
+
+        for(int i = 0; i < 6; i++){
+            Non_ref[i] = btmp[i+12];
+        }
+    }
+   
+//     Boardcast integer and real parameters to all proc    
+    int ntmp[19];
+    if(my_id == 0){
+        ntmp[0]=NX_GLOBAL;
+        ntmp[1]=NY_GLOBAL;
+        ntmp[2]=NZ_GLOBAL;
+        ntmp[3]=NPX0;
+        ntmp[4]=NPY0;
+        ntmp[5]=NPZ0;
+        ntmp[6]=LAP;
+        ntmp[7]=MSG_BLOCK_SIZE;               // defined in opencfd.h, as in common /ocfd_mpi_comm/  
+        ntmp[8]=Kstep_show;
+        ntmp[9]=IBC_USER;
+        ntmp[10]=N_ana;
+        ntmp[11]=Kstep_save;
+        ntmp[12]=Init_stat;
+        ntmp[13]=NFiltering;
+        ntmp[14]=Scheme_invis_ID;
+        ntmp[15]=Scheme_vis_ID;
+        ntmp[16]=Stream_MODE;
+        ntmp[17]=TEST;
+        ntmp[18]=IF_CHARTERIC;
+    }
+
+	MPI_Bcast(ntmp, 19, MPI_INT, 0, MPI_COMM_WORLD);
+
+    if(my_id!=0){
+        NX_GLOBAL = ntmp[0];
+        NY_GLOBAL = ntmp[1];
+        NZ_GLOBAL = ntmp[2];
+        NPX0 = ntmp[3];
+        NPY0 = ntmp[4];
+        NPZ0 = ntmp[5];
+        dummy_i = ntmp[6];
+        MSG_BLOCK_SIZE = ntmp[7];                 // defined in opencfd.h, as in common /ocfd_mpi_comm/  
+        Kstep_show = ntmp[8];
+        IBC_USER = ntmp[9];
+        N_ana = ntmp[10];
+        Kstep_save = ntmp[11];
+        Init_stat = ntmp[12];
+        NFiltering = ntmp[13];
+        Scheme_invis_ID = ntmp[14];
+        Scheme_vis_ID = ntmp[15];
+        Stream_MODE = ntmp[16];
+        TEST = ntmp[17];
+        IF_CHARTERIC = ntmp[18];
+    }
+
+//c----------------------------------------------------
+    REAL rtmp[8];
+    if(my_id==0){
+        rtmp[0]=Re;
+        rtmp[1]=Ama;
+        rtmp[2]=Gamma;
+        rtmp[3]=Pr;
+        rtmp[4]=dt;
+        rtmp[5]=end_time;
+        rtmp[6]=Ref_T;
+        rtmp[7]=epsl_SW;            // epsl in Steger-Warming splitting
+    }
+
+    MPI_Bcast(rtmp , 8 , OCFD_DATA_TYPE , 0 , MPI_COMM_WORLD);
+    
+    if(my_id!=0){
+        Re = rtmp[0];
+        Ama = rtmp[1];
+        Gamma = rtmp[2];
+        Pr = rtmp[3];
+        dt = rtmp[4];
+        end_time = rtmp[5];
+        Ref_T = rtmp[6];
+        epsl_SW = rtmp[7];
+    }
+    if(Ref_T <= 0) Ref_T=288.150;
+
+	dummy_i=5;
+
+    int htmp[3];
+
+    if(my_id == 0){
+        htmp[0] = IFLAG_HybridAuto;
+        htmp[1] = HybridAuto.Num_Patch_zones;
+        htmp[2] = HybridAuto.IF_Smooth_dp;
+        htmp[3] = HybridAuto.Style;
+    }
+
+    MPI_Bcast(htmp, 4, MPI_INT, 0, MPI_COMM_WORLD);
+
+    if(my_id != 0){
+        IFLAG_HybridAuto = htmp[0];
+        HybridAuto.Num_Patch_zones = htmp[1];
+        HybridAuto.IF_Smooth_dp = htmp[2];
+        HybridAuto.Style = htmp[3];
+    }
+
+    MPI_Bcast(HybridAuto.P_intvs, HybridA_Stage - 1, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+    MPI_Bcast(HybridAuto.zones, 6*Patch_max, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(HybridAuto.Pa_zones, Patch_max, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+//c----------------------------------------------------------------------------
+
+    MPI_Bcast(K_ana, N_ana, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(Kstep_ana, N_ana, MPI_INT, 0, MPI_COMM_WORLD);
+    
+    MPI_Bcast(BC_npara, 100, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(BC_rpara, 100, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+
+    MPI_Bcast(ANA_npara, N_ana*100, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(ANA_rpara, N_ana*100, OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+
+    MPI_Bcast(Filter_para, 11*(NFiltering+1), MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Bcast(Filter_rpara, 3*(NFiltering+1), OCFD_DATA_TYPE, 0, MPI_COMM_WORLD);
+
+//---------------------------------------------------------------------------------
+//   print the parameters
+  if(my_id == 0){
+    printf("##################################################################################################\n");
+    printf("Welcome to use OpenCFD-SCU-V1.00!\nCopyRight by Li-Xinliang, LHD, Institute of Mechanics, CAS (lixl@imech.ac.cn)\n");
+    printf("Coded by Liu-Shiwei, ICMSEC, Academy of Mathematics and Systems Science, CAS (liusw@lsec.cc.ac.cn)\n");
+    printf("Coded by Dang-Guanlin, LHD, Institute of Mechanics, CAS (dangguanlin@imech.ac.cn) 2020-01\n");
+    printf("Mesh(Nx,Ny,Nz): (%d,%d,%d)\n" , NX_GLOBAL, NY_GLOBAL, NZ_GLOBAL);
+    printf("3D Partation: %d*%d*%d   Total procs=%d\n", NPX0,NPY0,NPZ0 , NPX0*NPY0*NPZ0);
+    printf("Re=%f , Ma=%f , Gamma=%f , dt=%f\n", Re, Ama, Gamma, dt);
+    if(IFLAG_HybridAuto == 1) printf("Hybrid Scheme enabled, Hybrid style is %d\n", HybridAuto.Style);
+    printf("Start Computing ......\n");
+ 
+    FILE * file;
+    file = fopen("opencfd.log","a"); 
+    fprintf(file,"##################################################################################################\n");
+    fprintf(file,"OpenCFD-SCU-V1.00 CopyRight by Li-Xinliang, LHD, Institute of Mechanics, CAS (lixl@imech.ac.cn)\n");
+    fprintf(file,"Coded by Liu-Shiwei, ICMSEC, Academy of Mathematics and Systems Science, CAS (liusw@lsec.cc.ac.cn)\n");
+    fprintf(file,"Coded by Dang-Guanlin, LHD, Institute of Mechanics, CAS (dangguanlin@imech.ac.cn) 2020-01\n");
+    fprintf(file,"Mesh(Nx,Ny,Nz): (%d,%d,%d)\n" , NX_GLOBAL, NY_GLOBAL, NZ_GLOBAL);
+    fprintf(file,"3D Partation: %d*%d*%d   Total procs=%d\n", NPX0,NPY0,NPZ0 , NPX0*NPY0*NPZ0);
+    fprintf(file,"Re=%f , Ma=%f , Gamma=%f , dt=%f\n", Re, Ama, Gamma, dt);
+    if(IFLAG_HybridAuto == 1) fprintf(file, "Hybrid Scheme enabled, Hybrid style is %d\n", HybridAuto.Style);
+    fprintf(file,"Start Computing ......\n");
+
+    fclose(file);
+  }
+//-------------------------------------------------------
+}
+
+
+/*
+ * pthread start routine: sets the global Scheme_vis_ID to 203 when the
+ * NUL-terminated name passed in Scheme_vis is exactly "CD6".
+ * Leaves Scheme_vis_ID untouched otherwise. Always returns NULL.
+ */
+void* vis_choose_CD6(void* Scheme_vis){
+    char *Scheme_VIS = (char *)Scheme_vis;
+    if(strcmp(Scheme_VIS, "CD6") == 0) Scheme_vis_ID = 203;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_vis_ID to 204 when the
+ * NUL-terminated name passed in Scheme_vis is exactly "CD8".
+ * Leaves Scheme_vis_ID untouched otherwise. Always returns NULL.
+ */
+void* vis_choose_CD8(void* Scheme_vis){
+    char *Scheme_VIS = (char *)Scheme_vis;
+    if(strcmp(Scheme_VIS, "CD8") == 0) Scheme_vis_ID = 204;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+//----------------------------------------------------------------------
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 301 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "UP7".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_up7(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "UP7") == 0) Scheme_invis_ID = 301;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 302 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "WENO5".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_weno5(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "WENO5") == 0) Scheme_invis_ID = 302;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 303 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "WENO7".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_weno7(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "WENO7") == 0) Scheme_invis_ID = 303;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 304 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "WENO7_SYMBO".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_weno7_symbo(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "WENO7_SYMBO") == 0) Scheme_invis_ID = 304;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 305 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "WENO7_SYMBO_LIM".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_weno7_symbo_limit(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "WENO7_SYMBO_LIM") == 0) Scheme_invis_ID = 305;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 306 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "NND2".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_NND2(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "NND2") == 0) Scheme_invis_ID = 306;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 307 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "OMP6_HR".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_OMP6_HR(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "OMP6_HR") == 0) Scheme_invis_ID = 307;  //for OMP6, High-robust
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 308 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "OMP6_LD".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_OMP6_LD(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "OMP6_LD") == 0) Scheme_invis_ID = 308;  //for OMP6, Low-dissipation
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 309 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "OMP6_CD8".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_OMP6_CD8(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "OMP6_CD8") == 0) Scheme_invis_ID = 309;  //for OMP6, 8th-Center
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+/*
+ * pthread start routine: sets the global Scheme_invis_ID to 310 when the
+ * NUL-terminated name passed in Scheme_invis is exactly "SCHEME_HYBRIDAUTO".
+ * Leaves Scheme_invis_ID untouched otherwise. Always returns NULL.
+ */
+void* invis_choose_SCHEME_HYBRIDAUTO(void* Scheme_invis){
+    char *Scheme_INVIS = (char *)Scheme_invis;
+    if(strcmp(Scheme_INVIS, "SCHEME_HYBRIDAUTO") == 0) Scheme_invis_ID = 310;
+    return NULL; // a void* function must return a value; falling off the end is undefined
+}
+
+
+/*
+ * Translate the scheme names stored in *scheme into numeric IDs.
+ *
+ * Each recognised name has its own chooser routine; every chooser compares the
+ * name it is given against one literal and, on a match, writes the global
+ * Scheme_invis_ID / Scheme_vis_ID. The choosers are launched as pthreads using
+ * the global thread_handles[0..11] and joined before the result is inspected.
+ *
+ * If a thread cannot be created the chooser is run inline instead, so the
+ * lookup still completes and no uninitialised handle is ever joined.
+ *
+ * Terminates the process with a failure status when either ID is still 0
+ * after all choosers have run (i.e. a name was not recognised).
+ */
+void Schemes_Choose_ID(SCHEME_CHOOSE *scheme){
+    enum { N_INVIS = 10, N_CHOOSERS = 12 };
+
+    // Same launch order as before: inviscid choosers first, then viscous ones.
+    void* (*choosers[N_CHOOSERS])(void*) = {
+        invis_choose_weno7_symbo,
+        invis_choose_weno7_symbo_limit,
+        invis_choose_weno5,
+        invis_choose_weno7,
+        invis_choose_NND2,
+        invis_choose_OMP6_HR,
+        invis_choose_OMP6_LD,
+        invis_choose_OMP6_CD8,
+        invis_choose_up7,
+        invis_choose_SCHEME_HYBRIDAUTO,
+        vis_choose_CD6,
+        vis_choose_CD8
+    };
+    int started[N_CHOOSERS];
+
+    for(int t = 0; t < N_CHOOSERS; t++){
+        void *arg = (t < N_INVIS) ? (void*)(*scheme).invis : (void*)(*scheme).vis;
+
+        started[t] = (pthread_create(&thread_handles[t], NULL, choosers[t], arg) == 0);
+        // Thread creation failed: do the comparison on the calling thread instead.
+        if(!started[t]) choosers[t](arg);
+    }
+
+    // Join only the threads that were actually created.
+    for(int t = 0; t < N_CHOOSERS; t++){
+        if(started[t]) pthread_join(thread_handles[t], NULL);
+    }
+
+    if(Scheme_invis_ID == 0 || Scheme_vis_ID == 0){
+        printf("\033[31mSCHEMES CHOOSE IS WRONG！！！\033[0m\n");
+        exit(EXIT_FAILURE); // this is an error path: do not report success to the launcher
+    }
+}
+
+/*
+ * Split one "name=value\n" line into its name and value parts.
+ *
+ * src   : line to parse; its first '\n' is overwritten with '\0' on success.
+ * name  : receives the characters before the first '=' (spaces included),
+ *         NUL-terminated. Must hold at least (eq - src) + 1 bytes.
+ * value : receives everything between the first '=' and the line end.
+ *
+ * Returns 1 when the line contains both '=' and '\n', 0 otherwise (in which
+ * case src, name and value are left untouched).
+ */
+int ExtarctItem(char *src, char *name, char *value){
+    char *eq = strchr(src, '=');
+    char *lf = strchr(src, '\n');
+
+    if(eq == NULL || lf == NULL) return 0;
+
+    size_t name_len = (size_t)(eq - src);
+
+    *lf = '\0';
+    strncpy(name, src, name_len);
+    name[name_len] = '\0'; // strncpy does not terminate when it copies exactly name_len bytes
+    strcpy(value, eq+1);
+    return 1;
+}
+
+
+/*
+ * Copy name into buff with every space character (' ') removed.
+ *
+ * buff must be able to hold strlen(name) + 1 bytes. The result is always
+ * NUL-terminated, so callers no longer need to pre-zero buff.
+ */
+void ModifyItem(char *name, char *buff){
+    while(*name != '\0')
+    {
+        if(*name != ' '){
+            *buff = *name;
+            buff++;
+        }
+        name++;
+    }
+    *buff = '\0'; // terminate the compacted copy
+}
+
+/*
+ * Strip every decimal digit ('0'..'9') from buff, in place.
+ * The remaining characters keep their order and the string stays
+ * NUL-terminated.
+ */
+void RemovalNUM(char *buff){
+    char *out = buff;
+
+    for(const char *in = buff; *in != '\0'; in++){
+        if(*in >= '0' && *in <= '9') continue;   // drop digits
+        *out++ = *in;
+    }
+
+    *out = '\0';
+}
+
+/*
+ * Build a non-negative integer from the decimal digits found in buff.
+ * Every non-digit character is ignored, so "a1b2" yields 12; a string with
+ * no digits yields 0. No overflow checking is performed.
+ */
+int StringToInteger(char *buff){
+    int result = 0;
+
+    for(const char *p = buff; *p != '\0'; ++p){
+        if(*p < '0' || *p > '9') continue;   // skip anything that is not a digit
+        result = result*10 + *p - '0';
+    }
+
+    return result;
+}
+
+
+/*
+ * Scan a "name = value" style file and fill in the values of known items.
+ *
+ * file      : open stream; it is rewound before scanning.
+ * List      : configNum entries; for every line whose name (spaces removed)
+ *             equals List[i].name, the text after '=' is copied into
+ *             List[i].value. A later line overrides an earlier one.
+ * configNum : number of entries in List.
+ *
+ * Lines longer than the internal buffer, or without '=' / a trailing newline,
+ * are not matched (see ExtarctItem).
+ * NOTE(review): the copy into List[i].value is unbounded; it assumes the
+ * destination can hold a full line - confirm against configItem.
+ */
+void SearchItem(FILE *file, configItem *List, int configNum){
+    enum { BUF_LEN = 1000 };
+    char buff[BUF_LEN];
+    // Zero-filled so the name is a valid string even though ExtarctItem's
+    // strncpy does not append a terminator.
+    char name[BUF_LEN] = {0};
+    char value[BUF_LEN];
+
+    rewind(file);
+
+    while(fgets(buff, BUF_LEN, file))
+    {
+        if(ExtarctItem(buff, name, value)){
+            // buff is reused as the destination for the space-stripped name.
+            memset(buff, 0, sizeof(buff));
+            ModifyItem(name, buff);
+            for(int i = 0; i < configNum; i++){
+                if(strcmp(buff, List[i].name) == 0){
+                    strcpy(List[i].value, value);
+                }
+            }
+            // Reset for the next line so a shorter name is not followed by stale bytes.
+            memset(name, 0, sizeof(name));
+        }
+    }
+}
+
+
+/*
+ * Count the lines whose name, with spaces and digits removed, equals
+ * Item_name, and record the number embedded in each such name.
+ *
+ * file      : open stream; it is rewound before scanning.
+ * Item_name : name to look for (compared after spaces and digits are stripped
+ *             from the name found in the file).
+ * NameNUM   : output array; entry k receives the integer formed by the digits
+ *             of the k-th matching name.
+ *             NOTE(review): no capacity is passed in, so the caller must
+ *             provide room for every match - confirm at the call sites.
+ *
+ * Returns the number of matching lines.
+ */
+int ItemNUM(FILE *file, char *Item_name, int *NameNUM){
+    enum { BUF_LEN = 1000 };
+    int i = 0;
+    char buff[BUF_LEN];
+    // Zero-filled so the name is a valid string even though ExtarctItem's
+    // strncpy does not append a terminator.
+    char name[BUF_LEN] = {0};
+    char value[BUF_LEN];
+
+    rewind(file);
+
+    while(fgets(buff, BUF_LEN, file))
+    {
+        if(ExtarctItem(buff, name, value)){
+            // buff is reused as the destination for the normalised name.
+            memset(buff, 0, sizeof(buff));
+            ModifyItem(name, buff);
+
+            RemovalNUM(buff);
+
+            if(strcmp(buff, Item_name) == 0){
+                i += 1;
+                // The digits are taken from the original name, not the stripped copy.
+                *NameNUM = StringToInteger(name);
+                NameNUM++;
+            }
+
+            // Reset for the next line so a shorter name is not followed by stale bytes.
+            memset(name, 0, sizeof(name));
+        }
+    }
+
+    return i;
+}
+
+
+/*
+ * Split src on space characters and copy each token into consecutive rows of
+ * part. src is modified in place by strtok. Runs of spaces produce no empty
+ * tokens.
+ *
+ * Returns the number of tokens stored. The caller must supply enough rows,
+ * and each token must fit in a 1000-byte row.
+ */
+int PartItem(char *src, char part[][1000]){
+    const char delim[2] = " ";
+    int count = 0;
+
+    for(char *tok = strtok(src, delim); tok != NULL; tok = strtok(NULL, delim)){
+        strcpy(part[count], tok);
+        count++;
+    }
+
+    return count;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+
+
--- a/src/parameters_d.cu
+++ b/src/parameters_d.cu
+#include "config_parameters.h"
+/* 
+Parameters and constants required on the GPU side
+ */
+
+#include "cuda_runtime.h"
+#include "cuda_commen.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// Fixed-size pools of CUDA streams and events shared across the solver.
+cudaStream_t Stream[15];
+cudaEvent_t  Event[15];
+
+// -----constant-----
+// Scalars held in device constant memory (each name carries the _d suffix).
+__device__ __constant__ REAL Ama_d , Gamma_d , epsl_sw_d;
+__device__ __constant__  REAL Cv_d , Cp_d , Tsb_d , amu_C0_d;
+
+__device__ __constant__ REAL split_C1_d , split_C3_d;
+__device__ __constant__ REAL vis_flux_init_c_d;
+
+__device__ __constant__ unsigned int nx_d,ny_d,nz_d; // number of points handled in each direction
+__device__ __constant__ unsigned int nx_lap_d,ny_lap_d,nz_lap_d;
+__device__ __constant__ unsigned int nx_2lap_d,ny_2lap_d,nz_2lap_d;
+__device__ __constant__ REAL dt_d;
+__device__ __constant__ REAL Sin_AOA_d , Cos_AOA_d;
+__device__ __constant__ REAL TW_d;
+
+//---------------------WENO_SYMBO_Limiter------------------------------------------
+__device__ __constant__ REAL WENO_TV_Limiter_d;
+__device__ __constant__ REAL WENO_TV_MAX_d;
+
+// ----------------------------------------------------------
+// Coordinate parameters 
+__device__ __constant__ REAL hx_d,hy_d,hz_d;  // grid spacing in the reference space
+cudaField *pAxx_d,*pAyy_d,*pAzz_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d;  // metric coefficient matrices
+
+
+// calculate memory
+cudaField *pAmu_d; // viscous 3d [nz][ny][nx]
+cudaField *pd_d,*pu_d,*pv_d,*pw_d,*pT_d,*pP_d; //  [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+cudaSoA *pf_d,*pfn_d,*pdu_d; // [5][nz][ny][nx]
+
+// used in filtering
+cudaSoA *pf_lap_d; // [nz+2*LAP][ny+2*LAP][nx+2*LAP][5]
+
+// used in analysis
+cudaField *pdm_d, *pum_d, *pvm_d, *pwm_d, *pTm_d;
+
+// used in invis jacobian , is part of ptmpa
+cudaSoA *pfp_x_d; // [5][nz-2*LAP][ny-2*LAP][nx-2*LAP]
+cudaSoA *pfm_x_d; // [5][nz-2*LAP][ny-2*LAP][nx-2*LAP]
+
+cudaSoA *pfp_y_d; // [5][nz-2*LAP][ny-2*LAP][nx-2*LAP]
+cudaSoA *pfm_y_d; // [5][nz-2*LAP][ny-2*LAP][nx-2*LAP]
+
+cudaSoA *pfp_z_d; // [5][nz-2*LAP][ny-2*LAP][nx-2*LAP]
+cudaSoA *pfm_z_d; // [5][nz-2*LAP][ny-2*LAP][nx-2*LAP]
+
+cudaField *pcc_d; // [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+// used in invis jacobian , is part of ptmpb
+cudaField *pdfp_d , *pdfm_d; // [nz][ny][nx]
+
+
+// used in vis jacobian , is part of ptmpb
+cudaField * pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d;  // [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+// used in vis jacobian , is part of ptmpb
+cudaField *puk_d,*pui_d,*pus_d,*pvk_d,*pvi_d,*pvs_d,*pwk_d,*pwi_d,*pws_d,*pTk_d,*pTi_d,*pTs_d;  //[nz][ny][nx]
+cudaField *vis_u_d,*vis_v_d,*vis_w_d,*vis_T_d;  //[nz][ny][nx]
+
+// used in boundary_liftbody***************************************************
+
+
+// used in boundary_compressible_conner****************************************
+cudaField *pub1_d, *pfx_d, *pgz_d;
+
+cudaField grad_P;
+cudaField *pPP_d;
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/test.c
+++ b/src/test.c
+#include "test.h"
+#include "parameters.h"
+
+#include "stdio.h"
+/*
+ * Debug helper: dump a 5-component block as text, one value per line.
+ *
+ * name       : path of the output file (created or truncated).
+ * u          : contiguous data indexed as [n][k][j][i] with extents
+ *              [5][nz][ny][nx] (i varies fastest).
+ * nx, ny, nz : extents of the block.
+ *
+ * Each line holds the 1-based indices i, j, k, n followed by the value.
+ * If the file cannot be opened the error is reported with perror() and
+ * nothing is written.
+ */
+void write_block_me(char * name , REAL * u , int nx , int ny , int nz)    //[5][nz][ny][nx]
+{
+    REAL (*U)[nz][ny][nx] = (REAL(*)[nz][ny][nx])u;
+    FILE * file = fopen(name , "w");
+    if(file == NULL){
+        // Do not hand a NULL stream to fprintf/fclose.
+        perror(name);
+        return;
+    }
+for(int n=0;n<5;n++){
+    for(int k=0;k<nz;k++){
+        for(int j=0;j<ny;j++){
+            for(int i=0;i<nx;i++){
+                fprintf(file , "i=%3d,j=%3d,k=%2d,n=%1d\t%32.10lf\n",i+1,j+1,k+1,n+1,U[n][k][j][i]);
+            }
+        }
+    }
+}
+    fclose(file);
+}
+
+/*
+ * Debug helper: dump a single padded 3D field as text, one value per line.
+ *
+ * name       : path of the output file (created or truncated).
+ * u          : contiguous data indexed as [k][j][i] with extents
+ *              [nz+2*LAP][ny+2*LAP][nx+2*LAP] (i varies fastest).
+ * nx, ny, nz : extents of the block without the LAP-wide padding.
+ *
+ * Indices are printed relative to the unpadded block and 1-based, so the
+ * padding cells show up with indices <= 0 or > n.
+ * If the file cannot be opened the error is reported with perror() and
+ * nothing is written.
+ */
+void write_block_me1(char * name , REAL * u , int nx , int ny , int nz)   //[nz+2*LAP][ny+2*LAP][nx+2*LAP]
+{
+    REAL (*U)[ny+2*LAP][nx+2*LAP] = (REAL(*)[ny+2*LAP][nx+2*LAP])u;
+    FILE * file = fopen(name , "w");
+    if(file == NULL){
+        // Do not hand a NULL stream to fprintf/fclose.
+        perror(name);
+        return;
+    }
+    for(int k=0;k<nz+2*LAP;k++){
+        for(int j=0;j<ny+2*LAP;j++){
+            for(int i=0;i<nx+2*LAP;i++){
+                fprintf(file , "i=%3d,j=%3d,k=%2d\t%32.10lf\n",i-LAP+1,j-LAP+1,k-LAP+1,U[k][j][i]);
+            }
+        }
+    }
+    fclose(file);
+}
+
+/*
+ * Debug helper: dump a padded 5-component block as text, one value per line.
+ *
+ * name       : path of the output file (created or truncated).
+ * u          : contiguous data indexed as [n][k][j][i] with extents
+ *              [5][nz+2*LAP][ny+2*LAP][nx+2*LAP] (i varies fastest).
+ * nx, ny, nz : extents of the block without the LAP-wide padding.
+ *
+ * Spatial indices are printed relative to the unpadded block and 1-based;
+ * the component index n is 1-based.
+ * If the file cannot be opened the error is reported with perror() and
+ * nothing is written.
+ */
+void write_block_me2(char * name , REAL * u , int nx , int ny , int nz)    //[5][nz+2*LAP][ny+2*LAP][nx+2*LAP]
+{
+    REAL (*U)[nz+2*LAP][ny+2*LAP][nx+2*LAP] = (REAL(*)[nz+2*LAP][ny+2*LAP][nx+2*LAP])u;
+    FILE * file = fopen(name , "w");
+    if(file == NULL){
+        // Do not hand a NULL stream to fprintf/fclose.
+        perror(name);
+        return;
+    }
+for(int n=0;n<5;n++){
+    for(int k=0;k<nz+2*LAP;k++){
+        for(int j=0;j<ny+2*LAP;j++){
+            for(int i=0;i<nx+2*LAP;i++){
+                fprintf(file , "i=%3d,j=%3d,k=%2d,n=%1d\t%32.10lf\n",i-LAP+1,j-LAP+1,k-LAP+1,n+1,U[n][k][j][i]);
+            }
+        }
+    }
+}
+    fclose(file);
+}
+
+/*
+ * Debug helper: dump a single unpadded 3D field as text, one value per line.
+ *
+ * name       : path of the output file (created or truncated).
+ * u          : contiguous data indexed as [k][j][i] with extents
+ *              [nz][ny][nx] (i varies fastest).
+ * nx, ny, nz : extents of the block.
+ *
+ * Each line holds the 1-based indices i, j, k followed by the value.
+ * If the file cannot be opened the error is reported with perror() and
+ * nothing is written.
+ */
+void write_block_me3(char * name , REAL * u , int nx , int ny , int nz)    //[nz][ny][nx]
+{
+    REAL (*U)[ny][nx] = (REAL(*)[ny][nx])u;
+    FILE * file = fopen(name , "w");
+    if(file == NULL){
+        // Do not hand a NULL stream to fprintf/fclose.
+        perror(name);
+        return;
+    }
+    for(int k=0;k<nz;k++){
+        for(int j=0;j<ny;j++){
+            for(int i=0;i<nx;i++){
+                fprintf(file , "i=%3d,j=%3d,k=%2d\t%32.10lf\n",i+1,j+1,k+1,U[k][j][i]);
+            }
+        }
+    }
+    fclose(file);
+}
+
--- a/src/utility.cu
+++ b/src/utility.cu
+#include "utility.h"
+#include "stdio.h"
+#include "cuda_commen.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * Allocate `size` bytes of host memory through cudaHostAlloc and store the
+ * pointer in *p.
+ *
+ * funname, file, line identify the call site and are only used in the error
+ * message.
+ *
+ * Does not return on failure: the message is printed and flushed, then the
+ * whole MPI job is aborted. MPI_Abort is used instead of MPI_Finalize because
+ * MPI_Finalize is collective; a single failing rank calling it would wait for
+ * ranks that are still computing.
+ */
+void malloc_me_Host_(void **p, int size , const char * funname , const char * file , int line){
+
+	cudaError_t Status = cudaHostAlloc(p, size, cudaHostAllocDefault);
+    if(Status != cudaSuccess){
+       printf("Memory allocate error ! Can not allocate enough momory in fun %s ( file %s  , line %d ) , Proc %d\n" , funname ,file,line,my_id);
+       fflush(stdout); // make sure the diagnostic is written before the job is torn down
+       MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+       exit(EXIT_FAILURE); // not reached if MPI_Abort succeeds
+    }
+
+}
+
+/*
+ * malloc wrapper that never returns NULL to the caller.
+ *
+ * size                : number of bytes to allocate.
+ * funname, file, line : call-site information used only in the error message.
+ *
+ * Returns the pointer obtained from malloc. When malloc returns NULL the
+ * message is printed and flushed and the whole MPI job is aborted. MPI_Abort
+ * is used instead of MPI_Finalize because MPI_Finalize is collective; a
+ * single failing rank calling it would wait for ranks that are still
+ * computing.
+ */
+void * malloc_me_(int size , const char * funname , const char * file , int line){
+    void * tmp = malloc(size);
+    if(tmp == NULL){
+       printf("Memory allocate error ! Can not allocate enough momory in fun %s ( file %s  , line %d ) , Proc %d\n" , funname ,file,line,my_id);
+       fflush(stdout); // make sure the diagnostic is written before the job is torn down
+       MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+       exit(EXIT_FAILURE); // not reached if MPI_Abort succeeds
+    }
+    return tmp;
+}
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
--- a/src_hip/OCFD_Comput_Jacobian3d.cpp
+++ b/src_hip/OCFD_Comput_Jacobian3d.cpp
+#include "hip/hip_runtime.h"
+/*--------- This code runs only at the initial times -----------------------
+Reads the computational mesh (Axx, Ayy, Azz) and computes the Jacobian coefficients;
+this routine runs only during the initialization stage.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_Comput_Jacobian3d.h"
+#include "OCFD_Schemes_Choose.h"
+#include "OCFD_mpi.h"
+#include "OCFD_IO.h"
+
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "OCFD_mpi_dev.h"
+#include "commen_kernel.h"
+#include "math.h"
+#include "OCFD_ana.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+/*
+ * Initialise the mesh coordinates (Axx, Ayy, Azz) and the metric/Jacobian
+ * fields on host and device.
+ *
+ * Steps:
+ *   1. Every device field and every host array is filled with 1.0.
+ *   2. Depending on Init_stat and on whether "OCFD3d-Jacobi.dat" exists:
+ *        - Init_stat == 0        : an analytic mesh is written into the host
+ *                                  arrays pAxx/pAyy/pAzz.
+ *        - Jacobi file missing   : the mesh is read from "OCFD3d-Mesh.dat",
+ *                                  copied to the device and
+ *                                  Comput_Jacobian3d() is called.
+ *        - Jacobi file present   : mesh and all metric fields are read from
+ *                                  "OCFD3d-Jacobi.dat" and copied to the device.
+ *   3. ana_Jac() is called.
+ *
+ * NOTE(review): return codes of the MPI file calls are not checked.
+ */
+void Init_Jacobian3d()
+{
+    //    init with unit
+
+    cuda_mem_value_init_warp(1.0 , pAjac_d->ptr , pAjac_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAxx_d->ptr , pAxx_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAyy_d->ptr , pAyy_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAzz_d->ptr , pAzz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAkx_d->ptr , pAkx_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAky_d->ptr , pAky_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAkz_d->ptr , pAkz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAix_d->ptr , pAix_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAiy_d->ptr , pAiy_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAiz_d->ptr , pAiz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAsx_d->ptr , pAsx_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAsy_d->ptr , pAsy_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAsz_d->ptr , pAsz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    {
+        // Same unit fill for the host-side arrays, padding cells included.
+        REAL * tmp;
+        int tmp_size = (nx+2*LAP)*(ny+2*LAP)*(nz+2*LAP);
+        tmp = pAxx ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAyy ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAzz ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAkx ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAky ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAkz ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAix ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAiy ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAiz ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAsx ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAsy ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAsz ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        tmp = pAjac ;for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+    }
+    // -------------------------------------------------------------------------
+    char filename1[100];
+    MPI_File tmp_file;
+    sprintf(filename1, "OCFD3d-Jacobi.dat");
+
+    if(Init_stat == 0){
+        // Analytic mesh: x = i*hx, (y, z) = r*(cos(theta), sin(theta)), with
+        // theta stepping from -PI/2 in increments of PI/NY_GLOBAL and r
+        // stepping from r0 in increments of dr/NZ_GLOBAL. Global indices are
+        // obtained by adding this process's offsets.
+        // NOTE(review): this branch neither copies the mesh to the device nor
+        // calls Comput_Jacobian3d(), so the device fields keep the unit values
+        // set above - confirm this is intended.
+        int i,j,k;
+        int klap , jlap,ilap;
+        int i_off , j_off , k_off;
+        int i_real, j_real, k_real;
+        REAL r , d_r;
+        REAL theta , d_theta , theta_0;
+
+        REAL r0 = 1.0;
+        REAL dr = 1.0;
+        d_theta = PI / NY_GLOBAL;
+        theta_0 = -PI*0.5;
+        d_r = dr / NZ_GLOBAL;
+
+        i_off = i_offset[npx];
+        j_off = j_offset[npy];
+        k_off = k_offset[npz];
+        for(k = 0;k<nz;k++){
+            klap = k+LAP;
+            k_real = k + k_off;
+            r = r0 + d_r * k_real;
+            for(j=0;j<ny;j++){
+                jlap = j+LAP;
+                j_real = j + j_off;
+                theta = theta_0 + d_theta * j_real;
+                for(i=0;i<nx;i++){
+                    ilap = i+LAP;
+                    i_real = i + i_off;
+                    *(pAxx + ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap) = i_real * hx;
+
+                    *(pAyy + ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap) = r * cos(theta);
+                    *(pAzz + ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap) = r * sin(theta);
+                }
+            }
+        }
+        // On the last process in y, overwrite the final j-plane with
+        // y = 0, z = r (the values for theta = PI/2).
+        if(npy == NPY0 - 1){
+            jlap = ny - 1 + LAP;
+            for(k = 0; k<nz ; k++){
+                klap = k+LAP;
+                k_real = k + k_off;
+                r = r0 + d_r * k_real;
+                for(i=0;i<nx;i++){
+                    ilap = i+LAP;
+                    *(pAyy + ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap) = 0.0;
+                    *(pAzz + ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap) = r;
+                }
+            }
+        }
+    }else if(access(filename1, F_OK) == -1){ 
+        // OCFD3d-Jacobi.dat is not accessible: read the raw mesh and compute the metrics.
+        if(my_id == 0) printf("read 3D mesh data: OCFD3d-Mesh.dat ...\n");
+        
+        MPI_File_open(MPI_COMM_WORLD, "OCFD3d-Mesh.dat", MPI_MODE_RDONLY, MPI_INFO_NULL, &tmp_file);
+        
+        MPI_Offset offset = 0;
+
+        // Each field occupies NZ_GLOBAL slabs of (2 ints + NX_GLOBAL*NY_GLOBAL REALs);
+        // the offset is advanced by that amount between fields.
+        read_3d1(tmp_file, offset, pAxx);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAyy);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAzz);
+    
+        MPI_File_close(&tmp_file);
+
+        exchange_boundary_xyz(pAxx);
+        exchange_boundary_xyz(pAyy);
+        exchange_boundary_xyz(pAzz);
+        
+        memcpy_All(pAxx , pAxx_d->ptr , pAxx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAyy , pAyy_d->ptr , pAyy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAzz , pAzz_d->ptr , pAzz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+
+        Comput_Jacobian3d();
+    }else{
+        // OCFD3d-Jacobi.dat exists: read the mesh and the precomputed metric fields from it.
+        if(my_id == 0) printf("OCFD3d-Jacobi.dat is exit\nread 3D Jacobi data ...... ");
+        MPI_File tmp_file;
+        
+        MPI_File_open(MPI_COMM_WORLD, "OCFD3d-Jacobi.dat", MPI_MODE_RDONLY, MPI_INFO_NULL, &tmp_file);
+        
+        MPI_Offset offset = 0;
+
+        // Thirteen fields are stored back to back, in the order read below,
+        // using the same per-field stride as the mesh file.
+        read_3d1(tmp_file, offset, pAxx);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAyy);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAzz);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAkx);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAky);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAkz);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAix);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAiy);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAiz);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAsx);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAsy);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAsz);
+        offset += (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+        read_3d1(tmp_file, offset, pAjac);
+        
+    
+        MPI_File_close(&tmp_file);
+
+        exchange_boundary_xyz(pAxx);
+        exchange_boundary_xyz(pAyy);
+        exchange_boundary_xyz(pAzz);
+        exchange_boundary_xyz(pAkx);
+        exchange_boundary_xyz(pAky);
+        exchange_boundary_xyz(pAkz);
+        exchange_boundary_xyz(pAix);
+        exchange_boundary_xyz(pAiy);
+        exchange_boundary_xyz(pAiz);
+        exchange_boundary_xyz(pAsx);
+        exchange_boundary_xyz(pAsy);
+        exchange_boundary_xyz(pAsz);
+        exchange_boundary_xyz(pAjac);
+    
+        // Upload every host field to its device counterpart.
+        memcpy_All(pAxx , pAxx_d->ptr , pAxx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAyy , pAyy_d->ptr , pAyy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAzz , pAzz_d->ptr , pAzz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+
+        memcpy_All(pAkx , pAkx_d->ptr , pAkx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAky , pAky_d->ptr , pAky_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAkz , pAkz_d->ptr , pAkz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAix , pAix_d->ptr , pAix_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAiy , pAiy_d->ptr , pAiy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAiz , pAiz_d->ptr , pAiz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAsx , pAsx_d->ptr , pAsx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAsy , pAsy_d->ptr , pAsy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAsz , pAsz_d->ptr , pAsz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAjac , pAjac_d->ptr , pAjac_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+
+    }
+
+    ana_Jac();
+}
+
+
+/*
+ * Compute the metric/Jacobian fields and then exchange their boundary data.
+ *
+ * Process 0 prints a progress message before and after comput_Jac3d(). The
+ * nine metric fields and the Jacobian are then passed, one at a time and in
+ * the order k-, i-, s-metrics followed by the Jacobian, to
+ * exchange_boundary_xyz_packed_dev().
+ */
+void Comput_Jacobian3d(){
+
+    //boundary_Jac3d_Axx(); //only using the  boudary condition for Axx, Ayy, Azz
+
+    const int is_root = (my_id == 0);
+
+    if(is_root) printf("Comput Jacobian 3D data ...\n");
+    comput_Jac3d();
+    if(is_root) printf("Comput Jacobian 3D data OK\n");
+
+    // Host array / device field pairs, listed in exchange order.
+    REAL      *host_fields[] = { pAkx,   pAky,   pAkz,
+                                 pAix,   pAiy,   pAiz,
+                                 pAsx,   pAsy,   pAsz,
+                                 pAjac };
+    cudaField *dev_fields[]  = { pAkx_d, pAky_d, pAkz_d,
+                                 pAix_d, pAiy_d, pAiz_d,
+                                 pAsx_d, pAsy_d, pAsz_d,
+                                 pAjac_d };
+    const int n_fields = (int)(sizeof(host_fields) / sizeof(host_fields[0]));
+
+    for(int f = 0; f < n_fields; f++){
+        exchange_boundary_xyz_packed_dev(host_fields[f], dev_fields[f]);
+    }
+
+    //boundary_Jac3d_Ajac(); //boudary condition for Axx, Ayy, Azz, Aix, Aiy, Aiz , ......
+
+}
+
+// ----------------------------------------------------------------------------
+
+/*
+ * Kernel: compute the Jacobian and the nine metric terms at one grid point
+ * per thread, then replicate the values into the neighbouring padding cells
+ * along each axis.
+ *
+ * Inputs  xi..zk : the nine coordinate derivatives, read with get_Field at (x, y, z).
+ * Outputs Akx..Asz, Ajac : written with get_Field_LAP at (x+LAP, y+LAP, z+LAP).
+ *   Ajac receives 1 / det of the 3x3 derivative matrix; the nine metric
+ *   fields receive the 2x2 cofactor expressions and are NOT multiplied by
+ *   Ajac here.
+ * job : index range; a thread's (x, y, z) is its 3D global thread index plus
+ *   job.start, and only threads with indices below job.end do any work.
+ *
+ * Padding fill: a thread with x == 0 copies its values into padded indices
+ * 0..LAP-1 along x; a thread with x == job.end.x-1 copies them into
+ * x+LAP+1..x+2*LAP. The same is done for y and z. Because the thread index
+ * already includes job.start, the "== 0" cases only trigger when the
+ * corresponding job.start component is 0. Only face-adjacent padding is
+ * written; edge and corner padding cells are not touched by this kernel.
+ *
+ * NOTE(review): no check is made for a zero determinant before the division.
+ */
+__global__ void comput_Jac3d_kernal(
+    cudaField xi,
+    cudaField xj,
+    cudaField xk,
+    cudaField yi,
+    cudaField yj,
+    cudaField yk,
+    cudaField zi,
+    cudaField zj,
+    cudaField zk,
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    cudaJobPackage job
+){
+    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+	
+	if(x < job.end.x && y < job.end.y && z < job.end.z){
+        REAL xi1, xj1, xk1, yi1, yj1, yk1, zi1, zj1, zk1, Jac1;
+        xi1 = get_Field(xi, x, y, z);
+        xj1 = get_Field(xj, x, y, z);
+        xk1 = get_Field(xk, x, y, z);
+        yi1 = get_Field(yi, x, y, z);
+        yj1 = get_Field(yj, x, y, z);
+        yk1 = get_Field(yk, x, y, z);
+        zi1 = get_Field(zi, x, y, z);
+        zj1 = get_Field(zj, x, y, z);
+        zk1 = get_Field(zk, x, y, z);
+        Jac1 = 1.0 / (xi1 * yj1 * zk1 + yi1 * zj1 * xk1 + zi1 * xj1 * yk1 - zi1 * yj1 * xk1 - yi1 * xj1 * zk1 - xi1 * zj1 * yk1); //1./Jocabian = d(x,y,z)/d(i,j,k)
+        get_Field_LAP(Ajac , x+LAP , y+LAP , z+LAP) = Jac1;
+        get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP) = (yj1 * zk1 - zj1 * yk1);
+        get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP) = (zj1 * xk1 - xj1 * zk1);
+        get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP) = (xj1 * yk1 - yj1 * xk1);
+        get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP) = (yk1 * zi1 - zk1 * yi1);
+        get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP) = (zk1 * xi1 - xk1 * zi1);
+        get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP) = (xk1 * yi1 - yk1 * xi1);
+        get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP) = (yi1 * zj1 - zi1 * yj1);
+        get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP) = (zi1 * xj1 - xi1 * zj1);
+        get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP) = (xi1 * yj1 - yi1 * xj1);
+
+
+        // Low-x face: copy this point's values into padded x indices 0..LAP-1.
+        if(x == 0){
+            for(int i = 0; i < LAP; i++){
+                get_Field_LAP(Ajac, i, y+LAP, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx, i, y+LAP, z+LAP) =  get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky, i, y+LAP, z+LAP) =  get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz, i, y+LAP, z+LAP) =  get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix, i, y+LAP, z+LAP) =  get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy, i, y+LAP, z+LAP) =  get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz, i, y+LAP, z+LAP) =  get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx, i, y+LAP, z+LAP) =  get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy, i, y+LAP, z+LAP) =  get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz, i, y+LAP, z+LAP) =  get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        // High-x face: copy into the LAP padded cells beyond the last point.
+        if(x == job.end.x-1){
+            for(int i = 1; i <= LAP; i++){
+                get_Field_LAP(Ajac, x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        // Low-y face.
+        if(y == 0){
+            for(int i = 0; i < LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, i, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx, x+LAP, i, z+LAP) =  get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky, x+LAP, i, z+LAP) =  get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz, x+LAP, i, z+LAP) =  get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix, x+LAP, i, z+LAP) =  get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy, x+LAP, i, z+LAP) =  get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz, x+LAP, i, z+LAP) =  get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx, x+LAP, i, z+LAP) =  get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy, x+LAP, i, z+LAP) =  get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz, x+LAP, i, z+LAP) =  get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+
+        // High-y face.
+        if(y == job.end.y-1){
+            for(int i = 1; i <= LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        // Low-z face.
+        if(z == 0){
+            for(int i = 0; i < LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, y+LAP, i) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP, y+LAP, i) =  get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP, y+LAP, i) =  get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP, y+LAP, i) =  get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP, y+LAP, i) =  get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP, y+LAP, i) =  get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP, y+LAP, i) =  get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP, y+LAP, i) =  get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP, y+LAP, i) =  get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP, y+LAP, i) =  get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        // High-z face.
+        if(z == job.end.z-1){
+            for(int i = 1; i <= LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+	}
+}
+
+// Host driver for the grid-metric computation.
+//   1. Applies OCFD_dx0_jac / OCFD_dy0_jac / OCFD_dz0_jac to pAxx_d, pAyy_d
+//      and pAzz_d over the index box [LAP, n*_lap) and stores the nine
+//      results in buffers borrowed from pu*_d, pv*_d and pw*_d.
+//   2. Launches comput_Jac3d_kernal over the index box [0, n*) with those
+//      nine fields plus the pAk*_d / pAi*_d / pAs*_d / pAjac_d fields.
+void comput_Jac3d()
+{
+    // Borrowed views: only ptr and pitch are copied from the work fields.
+    cudaField xi, xj, xk;
+    cudaField yi, yj, yk;
+    cudaField zi, zj, zk;
+
+    xi.ptr = puk_d->ptr;    xi.pitch = puk_d->pitch;
+    xj.ptr = pui_d->ptr;    xj.pitch = pui_d->pitch;
+    xk.ptr = pus_d->ptr;    xk.pitch = pus_d->pitch;
+
+    yi.ptr = pvk_d->ptr;    yi.pitch = pvk_d->pitch;
+    yj.ptr = pvi_d->ptr;    yj.pitch = pvi_d->pitch;
+    yk.ptr = pvs_d->ptr;    yk.pitch = pvs_d->pitch;
+
+    zi.ptr = pwk_d->ptr;    zi.pitch = pwk_d->pitch;
+    zj.ptr = pwi_d->ptr;    zj.pitch = pwi_d->pitch;
+    zk.ptr = pws_d->ptr;    zk.pitch = pws_d->pitch;
+
+    // Derivative passes run on the box [LAP, n*_lap) in each direction.
+    cudaJobPackage job( dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap) );
+
+    // dx0 pass
+    OCFD_dx0_jac(*pAxx_d, xi, job, BlockDim_X, &Stream[0], Jacbound[0]);
+    OCFD_dx0_jac(*pAyy_d, yi, job, BlockDim_X, &Stream[0], Jacbound[0]);
+    OCFD_dx0_jac(*pAzz_d, zi, job, BlockDim_X, &Stream[0], Jacbound[0]);
+
+    // dy0 pass
+    OCFD_dy0_jac(*pAxx_d, xj, job, BlockDim_Y, &Stream[0], Jacbound[1]);
+    OCFD_dy0_jac(*pAyy_d, yj, job, BlockDim_Y, &Stream[0], Jacbound[1]);
+    OCFD_dy0_jac(*pAzz_d, zj, job, BlockDim_Y, &Stream[0], Jacbound[1]);
+
+    // dz0 pass
+    OCFD_dz0_jac(*pAxx_d, xk, job, BlockDim_Z, &Stream[0], Jacbound[2]);
+    OCFD_dz0_jac(*pAyy_d, yk, job, BlockDim_Z, &Stream[0], Jacbound[2]);
+    OCFD_dz0_jac(*pAzz_d, zk, job, BlockDim_Z, &Stream[0], Jacbound[2]);
+
+    // The metric kernel itself is indexed from 0 and sized by (nx, ny, nz).
+    dim3 grid, block;
+    cal_grid_block_dim(&grid, &block, BlockDimX/8, BlockDimY, BlockDimZ, nx, ny, nz);
+    job.setup( dim3(0,0,0), dim3(nx, ny, nz) );
+
+    CUDA_LAUNCH(( hipLaunchKernelGGL(comput_Jac3d_kernal, dim3(grid), dim3(block), 0, 0,
+            xi, xj, xk, yi, yj, yk, zi, zj, zk,
+            *pAkx_d, *pAky_d, *pAkz_d,
+            *pAix_d, *pAiy_d, *pAiz_d,
+            *pAsx_d, *pAsy_d, *pAsz_d,
+            *pAjac_d, job) ));
+}
+
+// ------------------------------------------------------------------------
+// Symmetry boundary at j=1 & j=ny_global
+
+// Each thread inside the job box overwrites pA(x, y, z) with
+// value * pA(x, 2*(ny_lap_d-1) - y, z), i.e. the entry mirrored in y about
+// index ny_lap_d-1, scaled by `value`.
+__global__ void boundary_Jac3d_kernal_y_r(cudaField pA, REAL value, cudaJobPackage job){
+    const unsigned int ix = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int iy = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int iz = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;
+
+    // Threads that fall outside the job box have nothing to do.
+    if( ix >= job.end.x || iy >= job.end.y || iz >= job.end.z ) return;
+
+    get_Field_LAP(pA, ix, iy, iz) = value*get_Field_LAP(pA, ix, 2*(ny_lap_d-1) - iy, iz);
+}
+
+// Each thread inside the job box overwrites pA(x, y, z) with
+// value * pA(x, 2*LAP - y, z), i.e. the entry mirrored in y about index LAP,
+// scaled by `value`.
+__global__ void boundary_Jac3d_kernal_y_l(cudaField pA, REAL value, cudaJobPackage job){
+    const unsigned int ix = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int iy = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int iz = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;
+
+    // Threads that fall outside the job box have nothing to do.
+    if( ix >= job.end.x || iy >= job.end.y || iz >= job.end.z ) return;
+
+    get_Field_LAP(pA, ix, iy, iz) = value*get_Field_LAP(pA, ix, 2*LAP - iy, iz);
+}
+
+
+// Fills the metric fields at (x, y, z) from the entries mirrored in y about
+// index LAP (source row 2*LAP - y).
+//   - Where xx(x, LAP, z) <= 0 the mirrored values are copied, with the sign
+//     flipped for Aky, Aix, Aiz and Asy.
+//   - Elsewhere the same sign pattern is used, but each value is additionally
+//     scaled by tmpxx, tmpyy or tmpxx*tmpyy, which are built from `seta` and
+//     the xx / yy differences between source rows 2*LAP-y and 2*LAP-y-1.
+__global__ void boundary_Jac3d_kernal_y_ramp_wall_kernel(
+    cudaField xx, 
+    cudaField yy, 
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    REAL seta,
+    cudaJobPackage job){
+    const unsigned int ix = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int iy = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int iz = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;
+
+    if( ix >= job.end.x || iy >= job.end.y || iz >= job.end.z ) return;
+
+    // Source row: mirror image of iy about index LAP.
+    const unsigned int ym = 2*LAP - iy;
+
+    if( get_Field_LAP(xx, ix, LAP, iz) <= 0.0 ){
+        // Plain mirror with alternating signs.
+        get_Field_LAP(Ajac, ix, iy, iz) =  get_Field_LAP(Ajac, ix, ym, iz);
+
+        get_Field_LAP(Akx, ix, iy, iz) =  get_Field_LAP(Akx, ix, ym, iz);
+        get_Field_LAP(Aky, ix, iy, iz) = -get_Field_LAP(Aky, ix, ym, iz);
+        get_Field_LAP(Akz, ix, iy, iz) =  get_Field_LAP(Akz, ix, ym, iz);
+
+        get_Field_LAP(Aix, ix, iy, iz) = -get_Field_LAP(Aix, ix, ym, iz);
+        get_Field_LAP(Aiy, ix, iy, iz) =  get_Field_LAP(Aiy, ix, ym, iz);
+        get_Field_LAP(Aiz, ix, iy, iz) = -get_Field_LAP(Aiz, ix, ym, iz);
+
+        get_Field_LAP(Asx, ix, iy, iz) =  get_Field_LAP(Asx, ix, ym, iz);
+        get_Field_LAP(Asy, ix, iy, iz) = -get_Field_LAP(Asy, ix, ym, iz);
+        get_Field_LAP(Asz, ix, iy, iz) =  get_Field_LAP(Asz, ix, ym, iz);
+        return;
+    }
+
+    // Coordinate differences between the source row and the row below it.
+    REAL dx = get_Field_LAP(xx, ix, ym, iz) - get_Field_LAP(xx, ix, ym - 1, iz);
+    REAL dy = get_Field_LAP(yy, ix, ym, iz) - get_Field_LAP(yy, ix, ym - 1, iz);
+
+    REAL tmpxx = fabs(-cos(2*seta) + sin(2*seta)*dy/dx);
+    REAL tmpyy = fabs(cos(2*seta) + dx/dy*sin(2*seta));
+
+    get_Field_LAP(Ajac, ix, iy, iz) = tmpxx*tmpyy*get_Field_LAP(Ajac, ix, ym, iz);
+
+    get_Field_LAP(Akx, ix, iy, iz) =  tmpyy*get_Field_LAP(Akx, ix, ym, iz);
+    get_Field_LAP(Aky, ix, iy, iz) = -tmpxx*get_Field_LAP(Aky, ix, ym, iz);
+    get_Field_LAP(Akz, ix, iy, iz) =  tmpxx*tmpyy*get_Field_LAP(Akz, ix, ym, iz);
+
+    get_Field_LAP(Aix, ix, iy, iz) = -tmpyy*get_Field_LAP(Aix, ix, ym, iz);
+    get_Field_LAP(Aiy, ix, iy, iz) =  tmpxx*get_Field_LAP(Aiy, ix, ym, iz);
+    get_Field_LAP(Aiz, ix, iy, iz) = -tmpxx*tmpyy*get_Field_LAP(Aiz, ix, ym, iz);
+
+    get_Field_LAP(Asx, ix, iy, iz) =  tmpyy*get_Field_LAP(Asx, ix, ym, iz);
+    get_Field_LAP(Asy, ix, iy, iz) = -tmpxx*get_Field_LAP(Asy, ix, ym, iz);
+    get_Field_LAP(Asz, ix, iy, iz) =  tmpxx*tmpyy*get_Field_LAP(Asz, ix, ym, iz);
+}
+
+// Host launcher for boundary_Jac3d_kernal_y_ramp_wall_kernel.
+// Only the ranks with npy == 0 do any work; they run the kernel over the
+// index box x in [LAP, nx_lap), y in [0, LAP), z in [LAP, nz_lap).
+//
+// seta : angle handed to the kernel after being divided by PI.
+void boundary_Jac3d_kernal_y_ramp_wall(REAL seta){
+    if (npy == 0)
+    {
+        // NOTE(review): the angle is scaled by 1/PI before use in cos/sin;
+        // a degree-to-radian conversion would be seta*PI/180 - confirm the
+        // unit the caller passes in. Behaviour is kept unchanged here.
+        seta = seta/PI;
+        dim3 griddim , blockdim;
+        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
+
+        // The launch geometry must be sized to the job box. Default-constructed
+        // dim3 values are (1,1,1), which would run a single thread and leave
+        // the rest of the box untouched.
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
+
+        CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_Jac3d_kernal_y_ramp_wall_kernel, dim3(griddim ), dim3(blockdim), 0, 0, *pAxx_d,*pAyy_d,*pAkx_d,
+            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,seta,job) ));
+    }
+}
+
+// Fills the metric fields at (x, y, z) from the entries mirrored in z about
+// index LAP (source plane 2*LAP - z). Every value is scaled by tmpxx, tmpzz
+// or tmpxx*tmpzz, with the sign flipped for Aky, Akz, Aix and Asx.
+// The scale factors are built from the xx / zz differences between source
+// planes 2*LAP-z and 2*LAP-z-1 and from a doubled angle, which is 2*seta1
+// where xx(x, y, LAP) <= 0 and 2*(seta1+seta2) elsewhere.
+__global__ void boundary_Jac3d_kernal_z_cone_wall_kernel(
+    cudaField xx, 
+    cudaField zz, 
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    REAL seta1,
+    REAL seta2,
+    cudaJobPackage job){
+    const unsigned int ix = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int iy = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int iz = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;
+
+    if( ix >= job.end.x || iy >= job.end.y || iz >= job.end.z ) return;
+
+    // Source plane: mirror image of iz about index LAP.
+    const unsigned int zm = 2*LAP - iz;
+
+    // The two regions differ only in the angle that enters cos/sin.
+    REAL ang2;
+    if( get_Field_LAP(xx, ix, iy, LAP) <= 0.0 ){
+        ang2 = 2*seta1;
+    }else{
+        ang2 = 2*(seta1+seta2);
+    }
+
+    // Coordinate differences between the source plane and the plane below it.
+    REAL dx = get_Field_LAP(xx, ix, iy, zm) - get_Field_LAP(xx, ix, iy, zm - 1);
+    REAL dz = get_Field_LAP(zz, ix, iy, zm) - get_Field_LAP(zz, ix, iy, zm - 1);
+
+    REAL tmpxx = fabs(-cos(ang2) + sin(ang2)*dz/dx);
+    REAL tmpzz = fabs(cos(ang2) + dx/dz*sin(ang2));
+
+    get_Field_LAP(Ajac, ix, iy, iz) = tmpxx*tmpzz*get_Field_LAP(Ajac, ix, iy, zm);
+
+    get_Field_LAP(Akx, ix, iy, iz) =  tmpzz*get_Field_LAP(Akx, ix, iy, zm);
+    get_Field_LAP(Aky, ix, iy, iz) = -tmpxx*tmpzz*get_Field_LAP(Aky, ix, iy, zm);
+    get_Field_LAP(Akz, ix, iy, iz) = -tmpxx*get_Field_LAP(Akz, ix, iy, zm);
+
+    get_Field_LAP(Aix, ix, iy, iz) = -tmpzz*get_Field_LAP(Aix, ix, iy, zm);
+    get_Field_LAP(Aiy, ix, iy, iz) =  tmpxx*tmpzz*get_Field_LAP(Aiy, ix, iy, zm);
+    get_Field_LAP(Aiz, ix, iy, iz) =  tmpxx*get_Field_LAP(Aiz, ix, iy, zm);
+
+    get_Field_LAP(Asx, ix, iy, iz) = -tmpzz*get_Field_LAP(Asx, ix, iy, zm);
+    get_Field_LAP(Asy, ix, iy, iz) =  tmpxx*tmpzz*get_Field_LAP(Asy, ix, iy, zm);
+    get_Field_LAP(Asz, ix, iy, iz) =  tmpxx*get_Field_LAP(Asz, ix, iy, zm);
+}
+
+// Host launcher for boundary_Jac3d_kernal_z_cone_wall_kernel.
+// Only the ranks with npz == 0 do any work; they run the kernel over the
+// index box x in [LAP, nx_lap), y in [LAP, ny_lap), z in [0, LAP).
+//
+// seta1, seta2 : angles handed to the kernel after being divided by PI.
+void boundary_Jac3d_kernal_z_cone_wall(REAL seta1, REAL seta2){
+    if (npz == 0)
+    {
+        // NOTE(review): both angles are scaled by 1/PI before use in cos/sin;
+        // a degree-to-radian conversion would be angle*PI/180 - confirm the
+        // unit the caller passes in. Behaviour is kept unchanged here.
+        seta1 = seta1/PI;
+        seta2 = seta2/PI;
+        dim3 griddim , blockdim;
+        cudaJobPackage job( dim3(LAP, LAP, 0) , dim3(nx_lap, ny_lap, LAP) );
+
+        // The launch geometry must be sized to the job box. Default-constructed
+        // dim3 values are (1,1,1), which would run a single thread and leave
+        // the rest of the box untouched.
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , LAP );
+
+        CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_Jac3d_kernal_z_cone_wall_kernel, dim3(griddim ), dim3(blockdim), 0, 0, *pAxx_d,*pAzz_d,*pAkx_d,
+            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,seta1,seta2,job) ));
+    }
+}
+
+// When IF_SYMMETRY == 1, mirrors pAxx_d, pAyy_d and pAzz_d into the y ghost
+// layers: the rows [0, LAP) on ranks with npy == 0 (via
+// boundary_Jac3d_kernal_y_l) and the rows [ny_lap, ny_2lap) on ranks with
+// npy == NPY0 - 1 (via boundary_Jac3d_kernal_y_r). pAyy_d is mirrored with
+// factor -1, the other two with factor +1.
+void boundary_Jac3d_Axx()
+{
+    if(IF_SYMMETRY != 1) return;
+
+    // Fields to mirror and the factor applied to each of them.
+    cudaField *const field[3] = { pAxx_d, pAyy_d, pAzz_d };
+    const double factor[3]    = { 1.0,    -1.0,   1.0 };
+
+    if (npy == 0)
+    {
+        dim3 grid, block;
+        cudaJobPackage job( dim3(LAP, 0, LAP), dim3(nx_lap, LAP, nz_lap) );
+        cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, nx, LAP, nz);
+
+        for(int f = 0; f < 3; f++){
+            CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_Jac3d_kernal_y_l, dim3(grid), dim3(block), 0, 0, *field[f], factor[f], job) ));
+        }
+    }
+
+    if (npy == NPY0 - 1)
+    {
+        dim3 grid, block;
+        cudaJobPackage job( dim3(LAP, ny_lap, LAP), dim3(nx_lap, ny_2lap, nz_lap) );
+        cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, nx, LAP, nz);
+
+        for(int f = 0; f < 3; f++){
+            CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_Jac3d_kernal_y_r, dim3(grid), dim3(block), 0, 0, *field[f], factor[f], job) ));
+        }
+    }
+}
+
+
+
+// When IF_SYMMETRY == 1, mirrors the ten metric fields (pAk*_d, pAi*_d,
+// pAs*_d, pAjac_d) into the y ghost layers: the rows [0, LAP) on ranks with
+// npy == 0 (via boundary_Jac3d_kernal_y_l) and the rows [ny_lap, ny_2lap) on
+// ranks with npy == NPY0 - 1 (via boundary_Jac3d_kernal_y_r).
+// pAky_d, pAix_d, pAiz_d and pAsy_d are mirrored with factor -1, all other
+// fields with factor +1.
+void boundary_Jac3d_Liftbody_Ajac()
+{
+    if(IF_SYMMETRY != 1) return;
+
+    // Fields to mirror, in launch order, and the factor applied to each.
+    cudaField *const field[10] = {
+        pAkx_d, pAky_d, pAkz_d,
+        pAix_d, pAiy_d, pAiz_d,
+        pAsx_d, pAsy_d, pAsz_d,
+        pAjac_d
+    };
+    const double factor[10] = {
+         1.0, -1.0,  1.0,
+        -1.0,  1.0, -1.0,
+         1.0, -1.0,  1.0,
+         1.0
+    };
+
+    if (npy == 0)
+    {
+        dim3 grid, block;
+        cudaJobPackage job( dim3(LAP, 0, LAP), dim3(nx_lap, LAP, nz_lap) );
+        cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, nx, LAP, nz);
+
+        for(int f = 0; f < 10; f++){
+            CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_Jac3d_kernal_y_l, dim3(grid), dim3(block), 0, 0, *field[f], factor[f], job) ));
+        }
+    }
+
+    if (npy == NPY0 - 1)
+    {
+        dim3 grid, block;
+        cudaJobPackage job( dim3(LAP, ny_lap, LAP), dim3(nx_lap, ny_2lap, nz_lap) );
+        cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, nx, LAP, nz);
+
+        for(int f = 0; f < 10; f++){
+            CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_Jac3d_kernal_y_r, dim3(grid), dim3(block), 0, 0, *field[f], factor[f], job) ));
+        }
+    }
+}
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src_hip/OCFD_IO.c
+++ b/src_hip/OCFD_IO.c
+//Read & save file
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include "mpi.h"
+
+#include "OCFD_ana.h"
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_IO.h"
+#include "OCFD_IO_mpi.h"
+#include "io_warp.h"
+
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "commen_kernel.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+/*
+ * Load flow data from disk.
+ *
+ *   Iflag_av == 0 : read the instantaneous data file into pd, pu, pv, pw, pT.
+ *                   Rank 0 tries to read a step number from "Opencfd.msg";
+ *                   a non-negative value selects "OCFD<step>.dat", otherwise
+ *                   "opencfd.dat" is used. Istep and tt are set from the
+ *                   file header.
+ *   Iflag_av == 1 : read "opencfd.average" into pdm, pum, pvm, pwm, pTm and
+ *                   pass them to memcpy_inner with the H2D flag. If the file
+ *                   does not exist, Istep_average and tt_average are reset
+ *                   and init_time_average() is called instead. average_IO is
+ *                   set to 0 in both cases. The pd..pT arguments are not
+ *                   used in this mode.
+ *
+ * File layout assumed by the offsets below:
+ *   [int][int step][REAL time][int], followed by five fields, each occupying
+ *   (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * NZ_GLOBAL bytes.
+ *
+ * A file that cannot be opened aborts the whole MPI job, because continuing
+ * would leave the header values and fields undefined.
+ */
+void read_file(
+	int Iflag_av,
+	REAL * pd,
+	REAL * pu,
+	REAL * pv,
+	REAL * pw,
+	REAL * pT)
+{
+
+	// Iflag_av == 0 , read opencfd data file; ==1, read averaged file
+	int Irestart_step;
+	char filename1[100];
+	// Distance in bytes from the start of one field to the start of the next.
+	const MPI_Offset field_stride = (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+	//-----------------------------------------------------------
+	if(Iflag_av == 0){
+		Irestart_step = -1;
+		if (my_id == 0)
+		{
+			FILE *msg_file = fopen("Opencfd.msg", "r");
+			if (msg_file != NULL)
+			{
+				// A failed or short read falls back to the initial file.
+				if (fread(&Irestart_step, sizeof(int), 1, msg_file) != 1)
+					Irestart_step = -1;
+				fclose(msg_file);
+			}
+			else
+			{
+				printf("Opencfd.msg is not exist, read initial file : opencfd.dat ......\n");
+			}
+		}
+
+		MPI_Bcast(&Irestart_step, 1, MPI_INT, 0, MPI_COMM_WORLD);
+		if (Irestart_step < 0)
+		{
+			snprintf(filename1, sizeof(filename1), "opencfd.dat");
+		}
+		else
+		{
+			snprintf(filename1, sizeof(filename1), "OCFD%08d.dat", Irestart_step);
+		}
+
+		MPI_File data_file;
+		// Header values are read into correctly typed variables, so the REAL
+		// time value is never accessed through a misaligned int buffer.
+		int step_in = 0;
+		REAL time_in = 0;
+
+		if(my_id == 0) printf("read initial data file: %s \n\n", filename1);
+		if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_RDONLY, MPI_INFO_NULL, &data_file) != MPI_SUCCESS)
+		{
+			if(my_id == 0) fprintf(stderr, "read_file: cannot open data file %s\n", filename1);
+			MPI_Abort(MPI_COMM_WORLD, 1);
+		}
+
+		MPI_File_read_at_all(data_file, sizeof(int), &step_in, 1, MPI_INT, &status);
+		MPI_File_read_at_all(data_file, 2*sizeof(int), &time_in, 1, OCFD_DATA_TYPE, &status);
+
+		MPI_Offset offset = 3*sizeof(int)+sizeof(REAL);
+
+		Istep = step_in;
+		tt = time_in;
+		if(my_id == 0) printf("Istep=%d , tt=%lf\n", Istep, tt);
+
+		read_3d1(data_file, offset, pd);
+		offset += field_stride;
+		read_3d1(data_file, offset, pu);
+		offset += field_stride;
+		read_3d1(data_file, offset, pv);
+		offset += field_stride;
+		read_3d1(data_file, offset, pw);
+		offset += field_stride;
+		read_3d1(data_file, offset, pT);
+
+		MPI_File_close(&data_file);
+		//------------------------
+		if (my_id == 0)
+			printf("read data ok\n");
+	}
+
+	//--------------------
+	if(Iflag_av == 1)
+	{
+		// averaged file
+		sprintf(filename1, "opencfd.average");
+
+		if (access(filename1, F_OK) == -1){
+
+			// The file does not exist: start averaging from scratch.
+			if(my_id == 0) printf("Average file: %s is not exit\n\n", filename1);
+			Istep_average = 0;
+			tt_average = 0.0;
+
+			init_time_average();
+		}else{
+			if (my_id == 0)
+				printf("read average_data begin\n");
+
+			MPI_File avg_file;
+			int step_in = 0;
+			REAL time_in = 0;
+
+			if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_RDONLY, MPI_INFO_NULL, &avg_file) != MPI_SUCCESS)
+			{
+				if(my_id == 0) fprintf(stderr, "read_file: cannot open average file %s\n", filename1);
+				MPI_Abort(MPI_COMM_WORLD, 1);
+			}
+
+			MPI_File_read_at_all(avg_file, sizeof(int), &step_in, 1, MPI_INT, &status);
+			MPI_File_read_at_all(avg_file, 2*sizeof(int), &time_in, 1, OCFD_DATA_TYPE, &status);
+
+			MPI_Offset offset = 3*sizeof(int)+sizeof(REAL);
+
+			Istep_average = step_in;
+			tt_average = time_in;
+			if(my_id == 0) printf("Istep_average=%d , tt_average=%lf\n", Istep_average, tt_average);
+
+			init_time_average();
+
+			read_3d1(avg_file, offset, pdm);
+			offset += field_stride;
+			read_3d1(avg_file, offset, pum);
+			offset += field_stride;
+			read_3d1(avg_file, offset, pvm);
+			offset += field_stride;
+			read_3d1(avg_file, offset, pwm);
+			offset += field_stride;
+			read_3d1(avg_file, offset, pTm);
+
+			MPI_File_close(&avg_file);
+
+			//------------------------
+			if (my_id == 0)
+				printf("read average_data ok\n");
+
+			memcpy_inner(pdm , pdm_d->ptr , pdm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pum , pum_d->ptr , pum_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pvm , pvm_d->ptr , pvm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pwm , pwm_d->ptr , pwm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pTm , pTm_d->ptr , pTm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+		}
+		average_IO = 0;
+	}
+	//---------------------
+}
+//----------------------------------------------------------------------------------
+
+//================================================================================
+/*
+ * Write five fields to "OCFD<Istep_name>.dat" (Iflag_av == 0) or
+ * "OCFD<Istep_name>.average" (any other Iflag_av).
+ *
+ * The file starts with a header record
+ *   [int length][int step][REAL time][int length]
+ * where step/time are Istep/tt for a data file and Istep_average/tt_average
+ * for an averaged file. The fields pd, pu, pv, pw, pT follow via write_3d1,
+ * each at a stride of (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) *
+ * NZ_GLOBAL bytes.
+ *
+ * A file that cannot be opened aborts the whole MPI job.
+ */
+void OCFD_save(
+	int Iflag_av,
+	int Istep_name,
+	REAL * pd,
+	REAL * pu,
+	REAL * pv,
+	REAL * pw,
+	REAL * pT)
+{
+
+	// Iflag_av==0, write opencfd file; ==1, write averaged data file
+
+	char filename1[120];
+	//-------------------------------------------
+	MPI_File out_file;
+
+	// Header payload, kept in correctly typed variables so the REAL time
+	// value is never stored through an int pointer.
+	int  step_out = (Iflag_av == 0) ? Istep : Istep_average;
+	REAL time_out = (Iflag_av == 0) ? tt : tt_average;
+
+	// Length marker that brackets the header payload. It equals the payload
+	// size (one int + one REAL), which is 12 bytes for an 8-byte REAL.
+	int rec_len = (int)(sizeof(int) + sizeof(REAL));
+
+	// Distance in bytes from the start of one field to the start of the next.
+	const MPI_Offset field_stride = (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+
+	if(Iflag_av == 0){
+		sprintf(filename1, "OCFD%08d.dat", Istep_name);
+	}else{
+		sprintf(filename1, "OCFD%08d.average", Istep_name);
+	}
+	if(my_id == 0) printf("write data file: %s\n", filename1);
+
+	if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &out_file) != MPI_SUCCESS)
+	{
+		if(my_id == 0) fprintf(stderr, "OCFD_save: cannot open %s for writing\n", filename1);
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	MPI_File_write_at_all(out_file, 0, &rec_len, 1, MPI_INT, &status);
+	MPI_File_write_at_all(out_file, sizeof(int), &step_out, 1, MPI_INT, &status);
+	MPI_File_write_at_all(out_file, 2*sizeof(int), &time_out, 1, OCFD_DATA_TYPE, &status);
+	MPI_File_write_at_all(out_file, 2*sizeof(int)+sizeof(REAL), &rec_len, 1, MPI_INT, &status);
+
+	MPI_Offset offset = 3*sizeof(int)+sizeof(REAL);
+
+	write_3d1(out_file, offset, pd);
+	offset += field_stride;
+	write_3d1(out_file, offset, pu);
+	offset += field_stride;
+	write_3d1(out_file, offset, pv);
+	offset += field_stride;
+	write_3d1(out_file, offset, pw);
+	offset += field_stride;
+	write_3d1(out_file, offset, pT);
+
+	MPI_File_close(&out_file);
+
+	//if (my_id == 0)
+	//{
+	//	if (Iflag_av == 0)
+	//	{
+	//		printf("write data OK\n");
+	//		tmp_file = fopen("Opencfd.msg", "a");
+	//		fprintf(tmp_file, "%d", Istep_name);
+	//		fclose(tmp_file);
+	//	}
+	//}
+}
+//-------------------------------------------------------------------------------------------
+
+//---------------------------------------------------------------------------------------------
+/*
+ * Copy the interior nx*ny*nz points of pU (an array with LAP ghost layers on
+ * every side) into a contiguous temporary buffer and hand that buffer to
+ * write_3d() at the given file offset.
+ *
+ *   file   : open MPI file handle, passed through to write_3d
+ *   offset : byte offset, passed through to write_3d
+ *   pU     : source array, indexed as [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+ *
+ * Aborts the MPI job if the temporary buffer cannot be allocated.
+ */
+void write_3d1(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	int i, j, k;
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+
+	// Widen before multiplying so the element count cannot overflow int.
+	const size_t count = (size_t)nx * (size_t)ny * (size_t)nz;
+	REAL *packed = (REAL*)malloc(count * sizeof(REAL));
+	if (packed == NULL && count != 0)
+	{
+		fprintf(stderr, "write_3d1: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	// Strip the ghost layers, keeping x fastest and z slowest.
+	REAL *dst = packed;
+	for (k = LAP; k < nz + LAP; k++)
+	{
+		for (j = LAP; j < ny + LAP; j++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				(*dst++) = U[k][j][i];
+			}
+		}
+	}
+
+	write_3d(file, offset, packed);
+	free(packed);
+}
+
+/*
+ * Read nx*ny*nz values with read_3d() from the given file offset into a
+ * contiguous temporary buffer, then scatter them into the interior of pU
+ * (an array with LAP ghost layers on every side). Ghost layers are not
+ * touched.
+ *
+ *   file   : open MPI file handle, passed through to read_3d
+ *   offset : byte offset, passed through to read_3d
+ *   pU     : destination array, indexed as [nz+2*LAP][ny+2*LAP][nx+2*LAP]
+ *
+ * Aborts the MPI job if the temporary buffer cannot be allocated.
+ */
+void read_3d1(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+
+	int i, j, k;
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+
+	// Widen before multiplying so the element count cannot overflow int.
+	const size_t count = (size_t)nx * (size_t)ny * (size_t)nz;
+	REAL *packed = (REAL*)malloc(count * sizeof(REAL));
+	if (packed == NULL && count != 0)
+	{
+		fprintf(stderr, "read_3d1: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	read_3d(file, offset, packed);
+
+	// Insert the values into the interior, x fastest and z slowest.
+	REAL *src = packed;
+	for (k = LAP; k < nz + LAP; k++)
+	{
+		for (j = LAP; j < ny + LAP; j++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U[k][j][i] = (*src++);
+			}
+		}
+	}
+	free(packed);
+}
+#ifdef __cplusplus
+}
+#endif
--- a/src_hip/OCFD_IO_mpi.c
+++ b/src_hip/OCFD_IO_mpi.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpi.h"
+
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_mpi.h"
+#include "io_warp.h"
+
+//---------------------------------------------------
+#ifdef __cplusplus
+extern "C"{
+#endif
+//void write_2d_XYa(
+//	FILE *file,
+//	int ka,
+//	int size_x,
+//	int size_y,
+//	int lap,
+//	int *pU)
+//{
+//
+//	int(*U)
+//	[size_y + 2*lap][size_x + 2*lap] = (int(*)[size_y + 2*lap][size_x + 2*lap])(pU);
+//	int(*U2d)
+//	[NX_GLOBAL], (*U0)[NX_GLOBAL];
+//	int node_k, k_local;
+//
+//	U2d = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
+//	memset((void*)U2d, 0, NX_GLOBAL * NY_GLOBAL * sizeof(int));
+//	if (my_id == 0){
+//		U0 = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
+//	}
+//	//--------------------------------
+//	get_k_node(ka, &node_k, &k_local);
+//	k_local += lap;
+//	int i, j;
+//	if(npz == node_k){
+//		for (j = lap; j < ny + lap; j++)
+//		{
+//			for (i = lap; i < nx + lap; i++)
+//			{
+//				U2d[j - lap + j_offset[npy]][i - lap + i_offset[npx]] = U[k_local][j][i];
+//			}
+//		}
+//	}
+//	MPI_Reduce(&U2d[0][0], &U0[0][0], NX_GLOBAL * NY_GLOBAL, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+////	if (my_id == 0)
+////		FWRITE(U0, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
+//
+//	if(my_id == 0){
+//		for(j = 0; j < NY_GLOBAL; j++){
+//			for(i = 0; i < NX_GLOBAL; i++){
+//				fprintf(file, "%08d\n", U0[j][i]);
+//			}
+//		}
+//	}
+//
+//	free(U2d);
+//	if (my_id == 0)
+//		free(U0);
+//}
+/*
+ * Gather one global X-Y plane (global k index `ka`) of an int array and of a
+ * REAL array onto rank 0 and print it as text, one "%08d%15.6lf" line per
+ * point, x fastest.
+ *
+ *   file           : output stream, used on rank 0 only
+ *   ka             : global k index, resolved with get_k_node()
+ *   size_x, size_y : interior extents used for the layout of pU
+ *   lap            : ghost-layer width of pU
+ *   pU             : int array indexed [k][size_y+2*lap][size_x+2*lap]
+ *   pU1            : REAL array indexed [k][ny+2*LAP][nx+2*LAP]
+ *
+ * Each rank fills its own patch of an otherwise zero global plane; the
+ * planes are then combined with MPI_Reduce(MPI_SUM) on rank 0. Ranks whose
+ * npz does not own the plane contribute zeros only.
+ * Collective: must be called by every rank of MPI_COMM_WORLD.
+ */
+void write_2d_XY(
+	FILE *file,
+	int ka,
+	int size_x,
+	int size_y,
+	int lap,
+	int *pU,
+	REAL *pU1)
+{
+
+	int(*U)[size_y + 2*lap][size_x + 2*lap] = (int(*)[size_y + 2*lap][size_x + 2*lap])(pU);
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	// Receive buffers stay NULL on non-root ranks, where MPI_Reduce ignores
+	// them and free(NULL) is a no-op.
+	int(*U2d)[NX_GLOBAL] = NULL, (*U0)[NX_GLOBAL] = NULL;
+	REAL(*U2d1)[NX_GLOBAL] = NULL, (*U01)[NX_GLOBAL] = NULL;
+	int node_k, k_local;
+	const size_t plane = (size_t)(NX_GLOBAL) * (size_t)(NY_GLOBAL);
+
+	// calloc yields the all-zero planes the sum reduction relies on.
+	U2d = (int(*)[NX_GLOBAL])calloc(plane, sizeof(int));
+	U2d1 = (REAL(*)[NX_GLOBAL])calloc(plane, sizeof(REAL));
+	if (my_id == 0){
+		U0 = (int(*)[NX_GLOBAL])malloc(sizeof(int) * plane);
+		U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * plane);
+	}
+	if (U2d == NULL || U2d1 == NULL || (my_id == 0 && (U0 == NULL || U01 == NULL)))
+	{
+		fprintf(stderr, "write_2d_XY: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+	//--------------------------------
+	get_k_node(ka, &node_k, &k_local);
+
+	int i, j;
+	if(npz == node_k){
+		// The two arrays have different ghost widths, so each gets its own
+		// k offset: `lap` for U and LAP for U1.
+		const int k_int  = k_local + lap;
+		const int k_real = k_local + LAP;
+
+		for (j = lap; j < ny + lap; j++)
+		{
+			for (i = lap; i < nx + lap; i++)
+			{
+				U2d[j - lap + j_offset[npy]][i - lap + i_offset[npx]] = U[k_int][j][i];
+			}
+		}
+
+		for (j = LAP; j < ny + LAP; j++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U2d1[j - LAP + j_offset[npy]][i - LAP + i_offset[npx]] = U1[k_real][j][i];
+			}
+		}
+	}
+	MPI_Reduce(U2d, U0, NX_GLOBAL * NY_GLOBAL, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	MPI_Reduce(U2d1, U01, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0){
+		for(j = 0; j < NY_GLOBAL; j++){
+			for(i = 0; i < NX_GLOBAL; i++){
+				fprintf(file, "%08d%15.6lf\n", U0[j][i], U01[j][i]);
+			}
+		}
+	}
+
+	free(U2d);
+	free(U2d1);
+	free(U0);
+	free(U01);
+}
+
+
+/*
+ * Gather one global X-Y plane (global k index `ka`) of a REAL array onto
+ * rank 0 and write it with FWRITE as NX_GLOBAL*NY_GLOBAL values, x fastest.
+ *
+ *   file : output stream, used on rank 0 only
+ *   ka   : global k index, resolved with get_k_node()
+ *   pU1  : REAL array indexed [k][ny+2*LAP][nx+2*LAP]
+ *
+ * Each rank fills its own patch of an otherwise zero global plane; the
+ * planes are then combined with MPI_Reduce(MPI_SUM) on rank 0.
+ * Collective: must be called by every rank of MPI_COMM_WORLD.
+ */
+void write_2d_XYa(
+	FILE *file,
+	int ka,
+	REAL *pU1)
+{
+
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	// The receive buffer stays NULL on non-root ranks, where MPI_Reduce
+	// ignores it and free(NULL) is a no-op.
+	REAL(*U2d1)[NX_GLOBAL] = NULL, (*U01)[NX_GLOBAL] = NULL;
+	int node_k, k_local;
+	const size_t plane = (size_t)(NX_GLOBAL) * (size_t)(NY_GLOBAL);
+
+	// calloc yields the all-zero plane the sum reduction relies on.
+	U2d1 = (REAL(*)[NX_GLOBAL])calloc(plane, sizeof(REAL));
+
+	if (my_id == 0){
+		U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * plane);
+	}
+	if (U2d1 == NULL || (my_id == 0 && U01 == NULL))
+	{
+		fprintf(stderr, "write_2d_XYa: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+	//--------------------------------
+	get_k_node(ka, &node_k, &k_local);
+
+	int i, j;
+	if(npz == node_k){
+		for (j = LAP; j < ny + LAP; j++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U2d1[j - LAP + j_offset[npy]][i - LAP + i_offset[npx]] = U1[k_local + LAP][j][i];
+			}
+		}
+	}
+
+	MPI_Reduce(U2d1, U01, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0) FWRITE(U01, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+
+	free(U2d1);
+	free(U01);
+}
+
+/*
+ * Gather one global Y-Z plane (global i index `ia`) of a REAL array onto
+ * rank 0 and write it with FWRITE as NY_GLOBAL*NZ_GLOBAL values, y fastest.
+ *
+ *   file : output stream, used on rank 0 only
+ *   ia   : global i index, resolved with get_i_node()
+ *   pU1  : REAL array indexed [k][ny+2*LAP][nx+2*LAP]
+ *
+ * Each rank fills its own patch of an otherwise zero global plane; the
+ * planes are then combined with MPI_Reduce(MPI_SUM) on rank 0.
+ * Collective: must be called by every rank of MPI_COMM_WORLD.
+ */
+void write_2d_YZa(
+	FILE *file,
+	int ia,
+	REAL *pU1)
+{
+
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	// The receive buffer stays NULL on non-root ranks, where MPI_Reduce
+	// ignores it and free(NULL) is a no-op.
+	REAL(*U2d1)[NY_GLOBAL] = NULL, (*U01)[NY_GLOBAL] = NULL;
+	int node_i, i_local;
+	const size_t plane = (size_t)(NY_GLOBAL) * (size_t)(NZ_GLOBAL);
+
+	// calloc yields the all-zero plane the sum reduction relies on.
+	U2d1 = (REAL(*)[NY_GLOBAL])calloc(plane, sizeof(REAL));
+
+	if (my_id == 0){
+		U01 = (REAL(*)[NY_GLOBAL])malloc(sizeof(REAL) * plane);
+	}
+	if (U2d1 == NULL || (my_id == 0 && U01 == NULL))
+	{
+		fprintf(stderr, "write_2d_YZa: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+	//--------------------------------
+	get_i_node(ia, &node_i, &i_local);
+
+	int j, k;
+	if(npx == node_i){
+		for (k = LAP; k < nz + LAP; k++)
+		{
+			for (j = LAP; j < ny + LAP; j++)
+			{
+				U2d1[k - LAP + k_offset[npz]][j - LAP + j_offset[npy]] = U1[k][j][i_local + LAP];
+			}
+		}
+	}
+
+	MPI_Reduce(U2d1, U01, NY_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0) FWRITE(U01, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
+
+	free(U2d1);
+	free(U01);
+}
+
+
+/*
+ * Gather one global X-Z plane (global j index `ja`) of a REAL array onto
+ * rank 0 and write it with FWRITE as NX_GLOBAL*NZ_GLOBAL values, x fastest.
+ *
+ *   file : output stream, used on rank 0 only
+ *   ja   : global j index, resolved with get_j_node()
+ *   pU1  : REAL array indexed [k][ny+2*LAP][nx+2*LAP]
+ *
+ * Each rank fills its own patch of an otherwise zero global plane; the
+ * planes are then combined with MPI_Reduce(MPI_SUM) on rank 0.
+ * Collective: must be called by every rank of MPI_COMM_WORLD.
+ */
+void write_2d_XZa(
+	FILE *file,
+	int ja,
+	REAL *pU1)
+{
+
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	// The receive buffer stays NULL on non-root ranks, where MPI_Reduce
+	// ignores it and free(NULL) is a no-op.
+	REAL(*U2d1)[NX_GLOBAL] = NULL, (*U01)[NX_GLOBAL] = NULL;
+	int node_j, j_local;
+	const size_t plane = (size_t)(NX_GLOBAL) * (size_t)(NZ_GLOBAL);
+
+	// calloc yields the all-zero plane the sum reduction relies on.
+	U2d1 = (REAL(*)[NX_GLOBAL])calloc(plane, sizeof(REAL));
+
+	if (my_id == 0){
+		U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * plane);
+	}
+	if (U2d1 == NULL || (my_id == 0 && U01 == NULL))
+	{
+		fprintf(stderr, "write_2d_XZa: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+	//--------------------------------
+	get_j_node(ja, &node_j, &j_local);
+
+	int i, k;
+	if(npy == node_j){
+		for (k = LAP; k < nz + LAP; k++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U2d1[k - LAP + k_offset[npz]][i - LAP + i_offset[npx]] = U1[k][j_local + LAP][i];
+			}
+		}
+	}
+
+	MPI_Reduce(U2d1, U01, NX_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0) FWRITE(U01, sizeof(REAL), NX_GLOBAL * NZ_GLOBAL, file)
+
+	free(U2d1);
+	free(U01);
+}
+
+
+//--------------------------------------------------------------
+//-----Write a 2D Y-Z (j-k) plane from 3-D array
+//void write_2d_YZa(
+//	FILE *file,
+//	int ia,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+//	REAL(*U2d), (*U0);
+//	int node_i, i_local;
+//
+//	U2d = (REAL *)malloc(sizeof(REAL) * ny * nz);
+//	if (my_id == 0)
+//		U0 = (REAL *)malloc(sizeof(REAL) * NY_GLOBAL * NZ_GLOBAL);
+//	//--------------------------------
+//	get_i_node(ia, &node_i, &i_local);
+//	i_local += LAP;
+//	int k, j;
+//	REAL *tmp = U2d;
+//	for (k = LAP; k < nz + LAP; k++)
+//	{
+//		for (j = LAP; j < ny + LAP; j++)
+//		{
+//			(*tmp++) = U[k][j][i_local];
+//		}
+//	}
+//
+//	for (int proc_k = 0; proc_k < NPZ0; k++)
+//	{
+//		for (int kk = k_offset[proc_k]; kk < k_offset[proc_k] + k_nn[proc_k]; kk++)
+//		{
+//			for (int proc_j = 0; proc_j < NPY0; proc_j++)
+//			{
+//				if (npx == node_i && npy == proc_j && npz == proc_k)
+//				{
+//					k = kk - k_offset[proc_k];
+//					MPI_Bsend(U2d + k * ny, ny, OCFD_DATA_TYPE, 0, kk, MPI_COMM_WORLD);
+//				}
+//				if (my_id == 0)
+//				{
+//					int recv_offset = j_offset[proc_j] + NY_GLOBAL * kk;
+//					MPI_Status status;
+//					MPI_Recv(U0 + recv_offset, j_nn[proc_j], OCFD_DATA_TYPE, PROCIdx2Num(node_i, proc_j, proc_k), kk, MPI_COMM_WORLD, &status);
+//				}
+//			}
+//			MPI_Barrier(MPI_COMM_WORLD);
+//		}
+//	}
+//	if (my_id == 0)
+//		FWRITE(U0, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
+//
+//	free(U2d);
+//	if (my_id == 0)
+//		free(U0);
+//}
+
+//-------------------------------------------------
+//----Write a 2d xz-plane from 3d array------------------------
+
+//void write_2d_XZa(
+//	FILE *file,
+//	int ja,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+//	REAL(*U2d), (*U0);
+//	int node_j, j_local;
+//
+//	U2d = (REAL *)malloc(sizeof(REAL) * nx * nz);
+//	if (my_id == 0)
+//		U0 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+//	//--------------------------------
+//	get_j_node(ja, &node_j, &j_local);
+//	j_local += LAP;
+//	int k, i;
+//	REAL *tmp = U2d;
+//	for (k = LAP; k < nz + LAP; k++)
+//	{
+//		for (i = LAP; i < nx + LAP; i++)
+//		{
+//			(*tmp++) = U[k][j_local][i];
+//		}
+//	}
+//	for (int proc_k = 0; proc_k < NPZ0; k++)
+//	{
+//		for (int kk = k_offset[proc_k]; kk < k_offset[proc_k] + k_nn[proc_k]; kk++)
+//		{
+//			for (int proc_i = 0; proc_i < NPX0; proc_i++)
+//			{
+//				if (npy == node_j && npx == proc_i && npz == proc_k)
+//				{
+//					k = kk - k_offset[proc_k];
+//					MPI_Bsend(U2d + k * nx, nx, OCFD_DATA_TYPE, 0, kk, MPI_COMM_WORLD);
+//				}
+//				if (my_id == 0)
+//				{
+//					int recv_offset = i_offset[proc_i] + NX_GLOBAL * kk;
+//					MPI_Status status;
+//					MPI_Recv(U0 + recv_offset, i_nn[proc_i], OCFD_DATA_TYPE, PROCIdx2Num(proc_i, node_j, proc_k), kk, MPI_COMM_WORLD, &status);
+//				}
+//			}
+//			MPI_Barrier(MPI_COMM_WORLD);
+//		}
+//	}
+//	if (my_id == 0)
+//		FWRITE(U0, sizeof(REAL), NX_GLOBAL * NZ_GLOBAL, file)
+//
+//	free(U2d);
+//	if (my_id == 0)
+//		free(U0);
+//}
+//--------------------------------------------------
+
+//----Write points from 3d array------------------------
+// NOTE: the index convention (base) used for ia, ja, ka in the external input
+// file still needs to be made explicit.
+//
+// For each of the `mpoints` points, the rank owning (ia[m], ja[m], ka[m])
+// sends the value with MPI_Bsend to rank 0, which receives it into slot m and
+// finally writes all collected values to `file`.
+//
+//   file       : output stream, written by rank 0 only
+//   pU         : local field with inner extents [ny + 2*LAP][nx + 2*LAP]
+//   mpoints    : number of points
+//   ia, ja, ka : per-point global indices passed to get_i_node/get_j_node/get_k_node
+void write_points(
+	FILE *file,
+	REAL *pU,
+	int mpoints,
+	int *ia,
+	int *ja,
+	int *ka)
+{
+	REAL(*field)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+	REAL *samples = (REAL *)malloc(sizeof(REAL) * mpoints);
+	//--------------------------------
+	int m = 0;
+	while (m < mpoints)
+	{
+		int owner_i, owner_j, owner_k;
+		int li, lj, lk;
+
+		get_i_node(ia[m], &owner_i, &li);
+		get_j_node(ja[m], &owner_j, &lj);
+		get_k_node(ka[m], &owner_k, &lk);
+
+		// Only the rank that holds the point sends it.
+		const int owns_point = (npx == owner_i) && (npy == owner_j) && (npz == owner_k);
+		if (owns_point)
+		{
+			REAL *value = &field[lk + LAP][lj + LAP][li + LAP];
+			MPI_Bsend(value, 1, OCFD_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
+		}
+
+		// Rank 0 collects the value from whichever rank owns it.
+		if (my_id == 0)
+		{
+			MPI_Status status;
+			MPI_Recv(samples + m, 1, OCFD_DATA_TYPE, PROCIdx2Num(owner_i, owner_j, owner_k), 0, MPI_COMM_WORLD, &status);
+		}
+		m++;
+	}
+	if (my_id == 0)
+		FWRITE(samples, sizeof(REAL), mpoints, file)
+	free(samples);
+}
+
+//--------------------------------------------------
+
+//--------------------------------------------------
+//void read_3d(
+//	FILE *file,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny][nx] = PTR2ARRAY2(pU, nx, ny);
+//
+//	REAL(*buff2d)
+//	[NX_GLOBAL], (*buff1)[NX_GLOBAL], *buff2, *buff_recv;
+//	int sendcounts1[NPY0], displs1[NPY0], sendcounts2[NPX0], displs2[NPX0];
+//	//---------------------------------------------------------------
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			buff2d = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+//		}
+//		buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//		buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);//NY_GLOBAL > ny
+//	}
+//	buff_recv = (REAL *)malloc(sizeof(REAL) * nx * ny);
+//
+//	if (my_id == 0)
+//		printf("read 3d data ...\n");
+//	// sendcounts1 displs1用于j方向分布
+//	for (int j = 0; j < NPY0; j++)
+//	{
+//		sendcounts1[j] = NX_GLOBAL * j_nn[j];
+//		displs1[j] = j_offset[j] * NX_GLOBAL;
+//	}
+//
+//	for (int i = 0; i < NPX0; i++)
+//	{
+//		sendcounts2[i] = ny * i_nn[i];
+//		displs2[i] = i_offset[i] * ny;
+//	}
+//
+//	int proc_k, k_local;
+//	for (int kk = 0; kk < NZ_GLOBAL; kk++)
+//	{
+//		get_k_node(kk, &proc_k, &k_local);
+//		if (my_id == 0)
+//			FREAD(buff2d, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+//
+//		if (proc_k != 0)
+//		{
+//			// k方向发送
+//			MPI_Status status;
+//			if (my_id == 0)
+//				MPI_Send(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, proc_k * (NPX0 * NPY0), 6666, MPI_COMM_WORLD);
+//			if (my_id == proc_k * NPX0 * NPY0)
+//				MPI_Recv(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, 6666, MPI_COMM_WORLD, &status);
+//		}
+//		if (npz == proc_k)
+//		{
+//			// j方向分散
+//			if (npx == 0)
+//			{
+//				MPI_Scatterv(buff2d, sendcounts1, displs1, OCFD_DATA_TYPE, buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
+//
+//				REAL *pbuff_recv;
+//				REAL *ppU;
+//				// i方向数据准备与离散
+//				for (int npx1 = 0; npx1 < NPX0; npx1++)
+//				{
+//					ppU = buff2 + displs2[npx1];
+//					for (int j = 0; j < ny; j++)
+//					{
+//						for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
+//						{
+//							(*ppU++) = buff1[j][i];
+//						}
+//					}
+//				}
+//			}
+//			//buff_recv = buff2;
+//			MPI_Scatterv(buff2, sendcounts2, displs2, OCFD_DATA_TYPE, buff_recv, nx * ny, OCFD_DATA_TYPE, 0, MPI_COMM_X);
+//
+//			// 数据分布
+//			{
+//				REAL *pbuff_recv;
+//				REAL *ppU;
+//				ppU = pU + k_local * nx * ny;
+//				pbuff_recv = buff_recv;
+//				for (int nn = 0; nn < nx * ny; nn++)
+//				{
+//					(*ppU++) = (*pbuff_recv++);
+//				}
+//			}
+//		}
+//	}
+//
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			free(buff2d);
+//		}
+//		free(buff1);
+//		free(buff2);
+//	}
+//	free(buff_recv);
+//}
+
+//void read_3d(
+//	MPI_File file,
+//	REAL *pU)
+//{
+//	size_t displs_start, displs_end, displs_k_start, displs_k_end;
+//
+//    REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//
+//    displs_start = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
+//    displs_end = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * (NZ_GLOBAL-k_offset[npz]-nz);
+//
+//	displs_k_start = sizeof(int) + (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
+//	displs_k_end = 2*sizeof(int) + (NY_GLOBAL - ny) * NX_GLOBAL * sizeof(REAL);
+//
+//    if (my_id == 0) printf("read 3d data ...\n");
+//
+//    MPI_File_seek(file, displs_start + displs_k_start, MPI_SEEK_CUR);
+//
+//    for(int k=0; k<nz; k++){
+//
+//        MPI_File_read(file, buff_recv, NX_GLOBAL*ny, OCFD_DATA_TYPE, &status);
+//
+//        MPI_File_seek(file, displs_k_end, MPI_SEEK_CUR);
+//
+//    // 数据分布
+//       {
+//           REAL *ppU;
+//       	   ppU = pU + k * nx * ny;
+//           
+//           for(int j=0;j<ny;j++){
+//               for(int i=0;i<nx;i++){
+//           	     *(ppU+j*nx+i) = *(buff_recv+j*NX_GLOBAL+i);
+//               }
+//           }
+//       }
+//    }
+//
+//    MPI_File_seek(file, displs_end - displs_k_start, MPI_SEEK_CUR);		 
+//
+//    free(buff_recv);
+//}
+
+// Read this rank's nx x ny x nz sub-block of one 3-D field from `file`.
+//
+//   file   : MPI file handle to read from
+//   offset : byte offset in `file` at which the field's first k-plane record starts
+//   pU     : destination without ghost layers, filled as pU[(k*ny + j)*nx + i]
+//
+// File layout implied by the offsets used here: one record per global k-plane,
+// each consisting of an int marker, NX_GLOBAL*NY_GLOBAL REAL values (i fastest)
+// and a second int marker.
+// Uses the non-collective MPI_File_read_at, once per local k-plane.
+void read_3d(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	// All byte arithmetic is done in MPI_Offset so that large planes cannot
+	// overflow int before being widened.
+	const MPI_Offset record_bytes = (MPI_Offset)(2 * sizeof(int))
+		+ (MPI_Offset)NX_GLOBAL * (MPI_Offset)NY_GLOBAL * (MPI_Offset)sizeof(REAL);
+	// Position of this rank's first value inside a record: skip the leading
+	// marker, j_offset[npy] full rows and i_offset[npx] values of the next row.
+	const MPI_Offset in_record_bytes = (MPI_Offset)sizeof(int)
+		+ ((MPI_Offset)j_offset[npy] * (MPI_Offset)NX_GLOBAL + (MPI_Offset)i_offset[npx]) * (MPI_Offset)sizeof(REAL);
+	MPI_Offset position = offset + record_bytes * (MPI_Offset)k_offset[npz] + in_record_bytes;
+
+	// A single read covers the span from the first owned value of the first
+	// owned row to the last owned value of the last owned row. Requesting a
+	// full NX_GLOBAL*ny values would run past the end of the plane record (and
+	// past end-of-file on the last plane) whenever i_offset[npx] > 0.
+	const int read_count = NX_GLOBAL * (ny - 1) + nx;
+
+	REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * (size_t)NX_GLOBAL * (size_t)ny);
+	if (buff_recv == NULL){
+		fprintf(stderr, "read_3d: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	if (my_id == 0) printf("read 3d data ...\n");
+
+	for(int k=0; k<nz; k++){
+
+		MPI_File_read_at(file, position, buff_recv, read_count, OCFD_DATA_TYPE, &status);
+
+		position += record_bytes;
+
+		// Scatter the rows: consecutive owned rows are NX_GLOBAL values apart
+		// in the read buffer and nx values apart in pU.
+		REAL *ppU = pU + (size_t)k * (size_t)nx * (size_t)ny;
+		for(int j=0;j<ny;j++){
+			memcpy(ppU + (size_t)j * (size_t)nx,
+			       buff_recv + (size_t)j * (size_t)NX_GLOBAL,
+			       sizeof(REAL) * (size_t)nx);
+		}
+	}
+
+	free(buff_recv);
+}
+//------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------
+
+// Write this rank's nx x ny x nz sub-block of a 3-D field into `file`, one
+// record per global k-plane.
+//
+//   file   : MPI file handle; written only by ranks with npx == 0 && npy == 0
+//   offset : byte offset in `file` at which the field's first k-plane record starts
+//   pU     : local data without ghost layers, laid out as pU[(k*ny + j)*nx + i]
+//
+// Each record is: int marker, NX_GLOBAL*NY_GLOBAL REAL values, int marker,
+// where the marker holds the byte size of the plane (narrowed to int).
+// For every local k-plane the data is gathered along i (MPI_COMM_X, all
+// ranks), reordered into full rows, gathered along j (MPI_COMM_Y, ranks with
+// npx == 0) and written by the rank that is root of both gathers.
+// The record buffer layout assumes sizeof(REAL) is a multiple of sizeof(int)
+// (true for float and double).
+void write_3d(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	const size_t plane_bytes = (size_t)NX_GLOBAL * (size_t)NY_GLOBAL * sizeof(REAL);
+	const size_t record_size = plane_bytes + 2 * sizeof(int);
+	const int marker = (int)plane_bytes;
+	// Number of ints in one record; equals 2*(NX_GLOBAL*NY_GLOBAL+1) for double.
+	const int record_ints = (int)(record_size / sizeof(int));
+	MPI_Offset displs_k = offset + (MPI_Offset)record_size * (MPI_Offset)k_offset[npz];
+
+	// Buffers that exist only on some ranks stay NULL elsewhere, so no
+	// indeterminate pointer is ever passed to MPI.
+	REAL(*buff1)[NX_GLOBAL] = NULL;
+	REAL *buff2 = NULL;
+	char *record_base = NULL;	// start of the allocation holding the record
+	char *record = NULL;		// leading marker; first byte written to the file
+	REAL *plane = NULL;		// REAL payload of the record (gather target)
+
+	int recvcounts1[NPY0], displs1[NPY0], recvcounts2[NPX0], displs2[NPX0];
+
+	if (npx == 0)
+	{
+		if (npy == 0)
+		{
+			// Pad the front of the allocation so that the payload, which
+			// follows the leading int marker, starts on a REAL-aligned
+			// address instead of at a 4-byte offset from malloc's result.
+			record_base = (char *)malloc(sizeof(REAL) + plane_bytes + sizeof(int));
+			if (record_base != NULL)
+			{
+				plane = (REAL *)(record_base + sizeof(REAL));
+				record = record_base + sizeof(REAL) - sizeof(int);
+				memcpy(record, &marker, sizeof(int));
+				memcpy(record + sizeof(int) + plane_bytes, &marker, sizeof(int));
+			}
+		}
+		buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * (size_t)NX_GLOBAL * (size_t)ny);
+		buff2 = (REAL *)malloc(sizeof(REAL) * (size_t)NX_GLOBAL * (size_t)ny);
+
+		if (buff1 == NULL || buff2 == NULL || (npy == 0 && record_base == NULL))
+		{
+			fprintf(stderr, "write_3d: out of memory\n");
+			MPI_Abort(MPI_COMM_WORLD, 1);
+		}
+	}
+
+	//---------------------------------------------------------------
+	if (my_id == 0)
+		printf("write 3d data ...\n");
+	// recvcounts1 / displs1: counts and displacements for the j-direction
+	// gather. It runs after the i-direction gather, so only the npx == 0
+	// column of ranks takes part in it.
+	for (int j = 0; j < NPY0; j++)
+	{
+		recvcounts1[j] = NX_GLOBAL * j_nn[j];
+		displs1[j] = j_offset[j] * NX_GLOBAL;
+	}
+	// recvcounts2 / displs2: counts and displacements for the i-direction gather.
+	for (int i = 0; i < NPX0; i++)
+	{
+		recvcounts2[i] = ny * i_nn[i];
+		displs2[i] = i_offset[i] * ny;
+	}
+
+	// Loop over the local k-planes.
+	for (int kk = 0; kk < nz; kk++)
+	{
+		// The plane is contiguous in pU, so it is sent directly without a
+		// staging copy.
+		REAL *local_plane = pU + (size_t)kk * (size_t)nx * (size_t)ny;
+
+		MPI_Gatherv(local_plane, nx * ny, OCFD_DATA_TYPE, buff2, recvcounts2, displs2, OCFD_DATA_TYPE, 0, MPI_COMM_X);
+
+		if (npx == 0)
+		{
+			// buff2 holds one ny x i_nn[p] block per i-rank; interleave the
+			// blocks into full rows of NX_GLOBAL values for the j gather.
+			for (int npx1 = 0; npx1 < NPX0; npx1++)
+			{
+				const REAL *src = buff2 + displs2[npx1];
+
+				for (int j = 0; j < ny; j++)
+				{
+					for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
+					{
+						buff1[j][i] = (*src++);
+					}
+				}
+			}
+			MPI_Gatherv(buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, plane, recvcounts1, displs1, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
+		}
+
+		if (npx == 0 && npy == 0){
+			MPI_File_write_at(file, displs_k, record, record_ints, MPI_INT, &status);
+		}
+
+		displs_k += (MPI_Offset)record_size;
+	}
+
+	// free(NULL) is a no-op on ranks that did not allocate a given buffer.
+	free(record_base);
+	free(buff1);
+	free(buff2);
+}
+
+
+//void write_3d(
+//	MPI_File file,
+//	REAL *pU)
+//{
+//    size_t displs_xy;
+//	size_t size = NX_GLOBAL*NY_GLOBAL*sizeof(REAL);
+//    size_t displs_non0_start = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
+//	size_t displs_non0_end = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL))*(NZ_GLOBAL-k_offset[npz]-nz);
+//
+//    REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * nx * ny);
+//    displs_xy = (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
+//
+//    if(my_id == 0){
+//        for(int i=0; i<k_offset[npz]; i++){
+//            MPI_File_write_all(file, &size, 1, MPI_INT, &status);
+//            MPI_File_seek(file, size, MPI_SEEK_CUR);		
+//            MPI_File_write_all(file, &size, 1, MPI_INT, &status);
+//        }
+//    }else{
+//        MPI_File_seek(file, displs_non0_start, MPI_SEEK_CUR);		
+//    }
+//
+//    for(int k=0; k<nz; k++){
+//    // 数据分布
+//       {
+//           REAL *ppU;
+//       	   ppU = pU + k * nx * ny;
+//           
+//           for(int j=0;j<ny;j++){
+//               for(int i=0;i<nx;i++){
+//		           *(buff_recv+j*nx+i) = *(ppU+j*nx+i);
+//               }
+//           }
+//       }
+//
+//        if(my_id == 0){
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//	    }else{
+//            MPI_File_seek(file, sizeof(int), MPI_SEEK_CUR);
+//	    }
+//
+//        MPI_File_seek(file, displs_xy, MPI_SEEK_CUR);
+//
+//        for(int j = 0; j < ny; j++){
+//            MPI_File_write_all(file, buff_recv + nx*j, nx, OCFD_DATA_TYPE, &status);
+//
+//            MPI_File_seek(file, sizeof(REAL)*(NX_GLOBAL-nx), MPI_SEEK_CUR);
+//        }
+//
+//        MPI_File_seek(file, sizeof(REAL)*((NY_GLOBAL-j_offset[npy]-ny)*NX_GLOBAL-i_offset[npx]), MPI_SEEK_CUR);
+//       
+//       	if(my_id == 0){
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//	    }else{
+//            MPI_File_seek(file, sizeof(int), MPI_SEEK_CUR);
+//	    }
+//
+//   }
+//
+//    if(my_id == 0){
+//        for(int i=0; i<(NZ_GLOBAL-k_offset[npz]-nz); i++){
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//            MPI_File_seek(file, size, MPI_SEEK_CUR);		
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//        }
+//    }else{
+//        MPI_File_seek(file, displs_non0_end, MPI_SEEK_CUR);		
+//    }
+//
+//   if (my_id == 0) printf("write 3d data ...\n");
+//
+//   free(buff_recv);
+//}
+//------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------
+//void write_3d(
+//	FILE *file,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny][nx] = PTR2ARRAY2(pU, nx, ny);
+//	REAL(*buff2d)
+//	[NX_GLOBAL], (*buff1)[NX_GLOBAL], *buff2, *buff_send;
+//
+//	int recvcounts1[NPY0], displs1[NPY0], recvcounts2[NPX0], displs2[NPX0];
+//
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			buff2d = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+//		}
+//		buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//		buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//	}
+//	buff_send = (REAL *)malloc(sizeof(REAL) * nx * ny);
+//
+//	//---------------------------------------------------------------
+//	if (my_id == 0)
+//		printf("write 3d data ...\n");
+//	// recvconts1 ， displs1 存储j方向收集时所使用的个数与偏移，
+//	// 由于j方向收集发生在i方向收集之后，因此只有一列参与j方向收集
+//	for (int j = 0; j < NPY0; j++)
+//	{
+//		recvcounts1[j] = NX_GLOBAL * j_nn[j];
+//		displs1[j] = j_offset[j] * NX_GLOBAL;
+//	}
+//	// i方向收集所需偏移与数量
+//	for (int i = 0; i < NPX0; i++)
+//	{
+//		recvcounts2[i] = ny * i_nn[i];
+//		displs2[i] = i_offset[i] * ny;
+//	}
+//
+//	// 按数据的k面进行循环
+//	int proc_k, k_local;
+//	for (int kk = 0; kk < NZ_GLOBAL; kk++)
+//	{
+//		get_k_node(kk, &proc_k, &k_local);
+//		if (npz == proc_k)
+//		{
+//			REAL *pbuff_send = (REAL *)buff_send;
+//			REAL *ppU = pU + k_local * nx * ny;
+//			// i方向收集数据准备
+//			for (int n = 0; n < nx * ny; n++)
+//				(*pbuff_send++) = (*ppU++);
+//			MPI_Gatherv(buff_send, nx * ny, OCFD_DATA_TYPE, buff2, recvcounts2, displs2, OCFD_DATA_TYPE, 0, MPI_COMM_X);
+//
+//			if (npx == 0)
+//			{
+//				// j方向收集数据调序
+//				for (int npx1 = 0; npx1 < NPX0; npx1++)
+//				{
+//					ppU = buff2 + displs2[npx1];
+//
+//					for (int j = 0; j < ny; j++)
+//					{
+//						for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
+//						{
+//							buff1[j][i] = (*ppU++);
+//						}
+//					}
+//				}
+//				MPI_Gatherv(buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, buff2d, recvcounts1, displs1, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
+//			}
+//		}
+//
+//		//
+//
+//		// k 方向收集
+//		if (proc_k != 0)
+//		{
+//			if (npx == 0 && npy == 0 && npz == proc_k)
+//			{
+//				MPI_Send(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, 666, MPI_COMM_WORLD);
+//			}
+//			if (my_id == 0)
+//			{
+//				MPI_Status status;
+//				MPI_Recv(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, proc_k * NPX0 * NPY0, 666, MPI_COMM_WORLD, &status);
+//			}
+//		}
+//		if (my_id == 0)
+//			FWRITE(buff2d, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+//	}
+//
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			free(buff2d);
+//		}
+//		free(buff1);
+//		free(buff2);
+//	}
+//	free(buff_send);
+//}
+
+//------------------------------------------------------------------------------------------------------------------
+//------------------------------------Write blockdata from 3d array-------------------------------------------------
+
+// Write the sub-block [ib..ie] x [jb..je] x [kb..ke] of a distributed 3-D
+// field to `file`.
+//
+//   file           : output stream, written by rank 0 only
+//   pU             : local field with inner extents [ny + 2*LAP][nx + 2*LAP]
+//   ib, ie, jb, je,
+//   kb, ke         : inclusive global index bounds, treated as 1-based
+//                    (Fortran style); the lower bounds are shifted to 0-based
+//                    below
+//
+// Every rank copies the part of the block it owns into a zeroed image of the
+// whole block and MPI_Reduce(MPI_SUM) assembles the block on rank 0.
+// Collective over MPI_COMM_WORLD. The early return for an empty range is only
+// consistent if all ranks pass the same bounds -- presumably they do, confirm
+// against the callers.
+void write_blockdata(
+	FILE *file,
+	REAL *pU,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke)
+{
+	const int nx1 = ie - ib + 1, ny1 = je - jb + 1, nz1 = ke - kb + 1;
+
+	// An empty or inverted range selects nothing.
+	if (nx1 <= 0 || ny1 <= 0 || nz1 <= 0)
+		return;
+
+	const size_t block_count = (size_t)nx1 * (size_t)ny1 * (size_t)nz1;
+
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+
+	// Heap buffers instead of block-sized stack arrays: a large block would
+	// otherwise overflow the stack. U1 must start zeroed for the sum
+	// reduction; U0 is only needed on the root rank.
+	REAL *U1 = (REAL *)calloc(block_count, sizeof(REAL));
+	REAL *U0 = NULL;
+	if (my_id == 0)
+		U0 = (REAL *)malloc(sizeof(REAL) * block_count);
+	if (U1 == NULL || (my_id == 0 && U0 == NULL))
+	{
+		fprintf(stderr, "write_blockdata: out of memory\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	// Assume the input file uses Fortran indexing, starting from 1.
+	ib -= 1;
+	jb -= 1;
+	kb -= 1;
+
+	const int gkb = k_offset[npz];
+	const int gjb = j_offset[npy];
+	const int gib = i_offset[npx];
+
+	// Local index ranges whose global index g = local + offset satisfies
+	// lower <= g < upper, clipped to the local extent.
+	const int k_lo = (kb - gkb > 0) ? kb - gkb : 0;
+	const int k_hi = (ke - gkb < nz) ? ke - gkb : nz;
+	const int j_lo = (jb - gjb > 0) ? jb - gjb : 0;
+	const int j_hi = (je - gjb < ny) ? je - gjb : ny;
+	const int i_lo = (ib - gib > 0) ? ib - gib : 0;
+	const int i_hi = (ie - gib < nx) ? ie - gib : nx;
+
+	for (int k = k_lo; k < k_hi; k++)
+	{
+		const size_t k1 = (size_t)(k + gkb - kb);
+		for (int j = j_lo; j < j_hi; j++)
+		{
+			const size_t j1 = (size_t)(j + gjb - jb);
+			REAL *row = U1 + (k1 * (size_t)ny1 + j1) * (size_t)nx1;
+			for (int i = i_lo; i < i_hi; i++)
+			{
+				row[i + gib - ib] = U[k + LAP][j + LAP][i + LAP];
+			}
+		}
+	}
+	MPI_Reduce(U1, U0, nx1 * ny1 * nz1, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	if (my_id == 0)
+		FWRITE(U0, sizeof(REAL), nx1 * ny1 * nz1, file)
+
+	free(U1);
+	free(U0);	// free(NULL) is a no-op on non-root ranks
+}
+#ifdef __cplusplus
+}
+#endif
+//--------------------------------------------------
--- a/src_hip/OCFD_NS_Jacobian3d.cpp
+++ b/src_hip/OCFD_NS_Jacobian3d.cpp
+#include "hip/hip_runtime.h"
+#include <math.h>
+
+#include "OCFD_NS_Jacobian3d.h"
+#include "parameters.h"
+#include "OCFD_Schemes_Choose.h"
+#include "OCFD_split.h"
+
+#include "commen_kernel.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "OCFD_mpi_dev.h"
+#include "parameters_d.h"
+#include "OCFD_flux_charteric.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Launch sound_speed_kernel on *pT_d / *pcc_d over the region of job_in
+// widened by LAP cells on every side, on the given stream.
+void du_invis_Jacobian3d_init(cudaJobPackage job_in, hipStream_t *stream){
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	// Launch geometry sized for the job extent plus LAP cells on both sides.
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ,
+	                   extent.x+2*LAP, extent.y+2*LAP, extent.z+2*LAP);
+
+	dim3 lower(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP);
+	dim3 upper(job_in.end.x + LAP, job_in.end.y + LAP, job_in.end.z + LAP);
+	cudaJobPackage job(lower, upper);
+
+	CUDA_LAUNCH(( hipLaunchKernelGGL(sound_speed_kernel, dim3(grid), dim3(block), 0, *stream, *pT_d , *pcc_d , job) ));
+}
+
+
+// Inviscid contribution in the first direction: calls OCFD_dx1 on *fp and
+// OCFD_dx2 on *fm, both with *pdu_d, the Akx/Aky/Akz metrics, block shape
+// BlockDim_X and the boundary flags D0_bound[0], D0_bound[1].
+// NOTE(review): fp / fm are presumably the positive / negative split fluxes --
+// confirm against the callers.
+void du_invis_Jacobian3d_x(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, hipStream_t *stream){
+
+	// scheme applied to fp
+	OCFD_dx1(*fp, *pdu_d, *pAjac_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAkx_d, *pAky_d, *pAkz_d, job_in, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
+
+	// scheme applied to fm, same arguments otherwise
+	OCFD_dx2(*fm, *pdu_d, *pAjac_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAkx_d, *pAky_d, *pAkz_d, job_in, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
+
+}
+
+// Inviscid contribution in the second direction: calls OCFD_dy1 on *fp and
+// OCFD_dy2 on *fm, both with *pdu_d, the Aix/Aiy/Aiz metrics, block shape
+// BlockDim_Y and the boundary flags D0_bound[2], D0_bound[3].
+// NOTE(review): fp / fm are presumably the positive / negative split fluxes --
+// confirm against the callers.
+void du_invis_Jacobian3d_y(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, hipStream_t *stream){
+
+	// scheme applied to fp
+	OCFD_dy1(*fp, *pdu_d, *pAjac_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAix_d, *pAiy_d, *pAiz_d, job_in, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
+
+	// scheme applied to fm, same arguments otherwise
+	OCFD_dy2(*fm, *pdu_d, *pAjac_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAix_d, *pAiy_d, *pAiz_d, job_in, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
+
+}
+
+
+// Inviscid contribution in the third direction: calls OCFD_dz1 on *fp and
+// OCFD_dz2 on *fm, both with *pdu_d, the Asx/Asy/Asz metrics, block shape
+// BlockDim_Z and the boundary flags D0_bound[4], D0_bound[5].
+// NOTE(review): fp / fm are presumably the positive / negative split fluxes --
+// confirm against the callers.
+void du_invis_Jacobian3d_z(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, hipStream_t *stream){
+
+	// scheme applied to fp
+	OCFD_dz1(*fp, *pdu_d, *pAjac_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAsx_d, *pAsy_d, *pAsz_d, job_in, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);
+
+	// scheme applied to fm, same arguments otherwise
+	OCFD_dz2(*fm, *pdu_d, *pAjac_d, *pu_d, *pv_d, *pw_d, *pcc_d, *pAsx_d, *pAsy_d, *pAsz_d, job_in, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);
+
+}
+
+// ========================================================
+
+// Applies OCFD_dx0, OCFD_dy0 and OCFD_dz0 to each of u, v, w and T over the
+// region (LAP,LAP,LAP)..(nx_lap,ny_lap,nz_lap), storing the twelve results in
+// the *k_d, *i_d and *s_d fields that vis_flux_ker later reads.
+// NOTE(review): these are presumably the computational-space derivatives of
+// the primitive variables -- confirm against the OCFD_d?0 implementations.
+void du_viscous_Jacobian3d_init(hipStream_t *stream){
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// first direction -> puk_d, pvk_d, pwk_d, pTk_d (boundary flags D0_bound[0..1])
+    OCFD_dx0(*pu_d, *puk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+    OCFD_dx0(*pv_d, *pvk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+    OCFD_dx0(*pw_d, *pwk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pT_d, *pTk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	
+	// second direction -> pui_d, pvi_d, pwi_d, pTi_d (boundary flags D0_bound[2..3])
+    OCFD_dy0(*pu_d, *pui_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+    OCFD_dy0(*pv_d, *pvi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+    OCFD_dy0(*pw_d, *pwi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pT_d, *pTi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	
+	// third direction -> pus_d, pvs_d, pws_d, pTs_d (boundary flags D0_bound[4..5])
+    OCFD_dz0(*pu_d, *pus_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+    OCFD_dz0(*pv_d, *pvs_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+    OCFD_dz0(*pw_d, *pws_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pT_d, *pTs_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+
+}
+
+
+// Per-cell viscous stress components.
+// Reads the nine velocity derivatives stored in vf (uk..ws) at the
+// LAP-shifted index (x-LAP, y-LAP, z-LAP), combines them with the nine metric
+// coefficients passed by pointer into ux..wz, and writes the six products with
+// *Amu to *s11, *s12, *s13, *s22, *s23, *s33.
+__device__ void vis_flux_s_ker(
+	vis_flux vf,
+
+	REAL *Akx,
+	REAL *Aix,
+	REAL *Asx,
+	REAL *Aky,
+	REAL *Aiy,
+	REAL *Asy,
+	REAL *Akz,
+	REAL *Aiz,
+	REAL *Asz,
+
+	REAL *Amu,
+
+	REAL *s11,
+	REAL *s12,
+	REAL *s13,
+	REAL *s22,
+	REAL *s23,
+	REAL *s33,
+
+	int x,
+	int y,
+	int z
+){
+	// Metric coefficients and viscosity, read once from the caller's storage.
+	const REAL kx = *Akx, ix = *Aix, sx = *Asx;
+	const REAL ky = *Aky, iy = *Aiy, sy = *Asy;
+	const REAL kz = *Akz, iz = *Aiz, sz = *Asz;
+	const REAL mu = *Amu;
+
+	// Stored derivatives of u, v, w in the three computational directions.
+	const REAL uk = get_Field(vf.uk, x-LAP, y-LAP, z-LAP);
+	const REAL ui = get_Field(vf.ui, x-LAP, y-LAP, z-LAP);
+	const REAL us = get_Field(vf.us, x-LAP, y-LAP, z-LAP);
+	const REAL vk = get_Field(vf.vk, x-LAP, y-LAP, z-LAP);
+	const REAL vi = get_Field(vf.vi, x-LAP, y-LAP, z-LAP);
+	const REAL vs = get_Field(vf.vs, x-LAP, y-LAP, z-LAP);
+	const REAL wk = get_Field(vf.wk, x-LAP, y-LAP, z-LAP);
+	const REAL wi = get_Field(vf.wi, x-LAP, y-LAP, z-LAP);
+	const REAL ws = get_Field(vf.ws, x-LAP, y-LAP, z-LAP);
+
+	// Combination with the x-, y- and z-metrics.
+	const REAL ux = uk*kx + ui*ix + us*sx;
+	const REAL vx = vk*kx + vi*ix + vs*sx;
+	const REAL wx = wk*kx + wi*ix + ws*sx;
+
+	const REAL uy = uk*ky + ui*iy + us*sy;
+	const REAL vy = vk*ky + vi*iy + vs*sy;
+	const REAL wy = wk*ky + wi*iy + ws*sy;
+
+	const REAL uz = uk*kz + ui*iz + us*sz;
+	const REAL vz = vk*kz + vi*iz + vs*sz;
+	const REAL wz = wk*kz + wi*iz + ws*sz;
+
+	const REAL div = ux+vy+wz;
+
+	// Diagonal terms.
+	*s11 = (2.0*ux-2.0/3.0*div) * mu;
+	*s22 = (2.0*vy-2.0/3.0*div) * mu;
+	*s33 = (2.0*wz-2.0/3.0*div) * mu;
+
+	// Off-diagonal terms.
+	*s12 = (uy+vx)* mu;
+	*s13 = (uz+wx)* mu;
+	*s23 = (vz+wy)* mu;
+}
+
+
+// Per-cell energy-equation viscous terms.
+// Writes to *E1, *E2, *E3 the sum of (u, v, w) dotted with the matching row
+// of the stress components s11..s33 and the product of *Amu,
+// vis_flux_init_c_d and the temperature derivative Tx / Ty / Tz.
+// Temperature derivatives (Tk, Ti, Ts) are read at the LAP-shifted index,
+// velocities at the ghost-inclusive index (x, y, z).
+__device__ void vis_flux_e_ker(
+	vis_flux vf,
+
+	REAL *Amu,
+	REAL *Akx,
+	REAL *Aky,
+	REAL *Akz,
+	REAL *Aix,
+	REAL *Aiy,
+	REAL *Aiz,
+	REAL *Asx,
+	REAL *Asy,
+	REAL *Asz,
+
+	REAL *s11,
+	REAL *s12,
+	REAL *s13,
+	REAL *s22,
+	REAL *s23,
+	REAL *s33,
+
+	REAL *E1,
+	REAL *E2,
+	REAL *E3,
+
+	int x,
+	int y,
+	int z
+){
+	// Stored temperature derivatives in the three computational directions.
+	const REAL Tk = get_Field(vf.Tk, x-LAP, y-LAP, z-LAP);
+	const REAL Ti = get_Field(vf.Ti, x-LAP, y-LAP, z-LAP);
+	const REAL Ts = get_Field(vf.Ts, x-LAP, y-LAP, z-LAP);
+
+	// Velocity components at this cell.
+	const REAL u  = get_Field_LAP(vf.u, x, y, z);
+	const REAL v  = get_Field_LAP(vf.v, x, y, z);
+	const REAL w  = get_Field_LAP(vf.w, x, y, z);
+
+	// Coefficient multiplying the temperature-derivative term.
+	const REAL Amuk = *Amu * vis_flux_init_c_d;
+
+	const REAL Tx = Tk* *Akx + Ti* *Aix + Ts* *Asx;
+	const REAL Ty = Tk* *Aky + Ti* *Aiy + Ts* *Asy;
+	const REAL Tz = Tk* *Akz + Ti* *Aiz + Ts* *Asz;
+
+	// Stress components, read once from the caller's storage.
+	const REAL t11 = *s11, t12 = *s12, t13 = *s13;
+	const REAL t22 = *s22, t23 = *s23, t33 = *s33;
+
+	*E1 = u*t11 + v*t12 + w*t13 + Amuk*Tx;
+	*E2 = u*t12 + v*t22 + w*t23 + Amuk*Ty;
+	*E3 = u*t13 + v*t23 + w*t33 + Amuk*Tz;
+}
+
+
+// Projects the stress components and energy terms onto one direction.
+// The direction vector is (vf.Ax, vf.Ay, vf.Az) scaled by vf.Ajac at cell
+// (x, y, z); the four results are stored in Ev1..Ev4 at the same cell.
+__device__ void vis_flus_ev_ker(
+	vis_flux vf,
+
+	REAL *s11,
+	REAL *s12,
+	REAL *s13,
+	REAL *s22,
+	REAL *s23,
+	REAL *s33,
+
+	REAL *E1,
+	REAL *E2,
+	REAL *E3,
+
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+
+	int x,
+	int y,
+	int z
+){
+	// Jacobian-scaled direction metrics.
+	const REAL jac = get_Field_LAP(vf.Ajac , x,y,z);
+	const REAL akx = get_Field_LAP(vf.Ax, x, y, z)*jac;
+	const REAL aky = get_Field_LAP(vf.Ay, x, y, z)*jac;
+	const REAL akz = get_Field_LAP(vf.Az, x, y, z)*jac;
+
+	// Rows of the symmetric stress components, then the energy terms.
+	const REAL flux1 = ( akx* *s11 + aky* *s12 + akz* *s13 );
+	const REAL flux2 = ( akx* *s12 + aky* *s22 + akz* *s23 );
+	const REAL flux3 = ( akx* *s13 + aky* *s23 + akz* *s33 );
+	const REAL flux4 = ( akx* *E1  + aky* *E2  + akz* *E3  );
+
+	get_Field_LAP(Ev1, x, y, z) = flux1;
+	get_Field_LAP(Ev2, x, y, z) = flux2;
+	get_Field_LAP(Ev3, x, y, z) = flux3;
+	get_Field_LAP(Ev4, x, y, z) = flux4;
+}
+
+
+// Kernel: one thread per cell of `job`, 3-D launch.
+// The thread index plus job.start gives the ghost-inclusive cell index;
+// threads at or beyond job.end do nothing. For each cell the three device
+// helpers are chained: vis_flux_s_ker, vis_flux_e_ker, then vis_flus_ev_ker,
+// which stores the results in Ev1..Ev4.
+__global__ void vis_flux_ker(
+
+	vis_flux vf,
+
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+
+	cudaJobPackage job)
+{
+	// indices refer to cells including the LAP layers
+	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads outside the job region have no work.
+	if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+	// Metric coefficients at this cell.
+	REAL Akx = get_Field_LAP(vf.Akx, x, y, z);
+	REAL Aix = get_Field_LAP(vf.Aix, x, y, z);
+	REAL Asx = get_Field_LAP(vf.Asx, x, y, z);
+	REAL Aky = get_Field_LAP(vf.Aky, x, y, z);
+	REAL Aiy = get_Field_LAP(vf.Aiy, x, y, z);
+	REAL Asy = get_Field_LAP(vf.Asy, x, y, z);
+	REAL Akz = get_Field_LAP(vf.Akz, x, y, z);
+	REAL Aiz = get_Field_LAP(vf.Aiz, x, y, z);
+	REAL Asz = get_Field_LAP(vf.Asz, x, y, z);
+
+	// Amu is addressed with the LAP-shifted index.
+	REAL Amu = get_Field(vf.Amu, x-LAP, y-LAP, z-LAP);
+
+	// Intermediate results handed from one helper to the next.
+	REAL s11, s12, s13, s22, s23, s33;
+	REAL E1, E2, E3;
+
+	vis_flux_s_ker(vf,
+		&Akx,&Aix,&Asx,&Aky,&Aiy,&Asy,&Akz,&Aiz,&Asz,
+		&Amu,
+		&s11,&s12,&s13,&s22,&s23,&s33,
+		x,y,z);
+
+	vis_flux_e_ker(vf,
+		&Amu,
+		&Akx,&Aky,&Akz,&Aix,&Aiy,&Aiz,&Asx,&Asy,&Asz,
+		&s11,&s12,&s13,&s22,&s23,&s33,
+		&E1,&E2,&E3,
+		x,y,z);
+
+	vis_flus_ev_ker(vf,
+		&s11,&s12,&s13,&s22,&s23,&s33,
+		&E1,&E2,&E3,
+		Ev1,Ev2,Ev3,Ev4,
+		x,y,z);
+}
+
+
+// Launches vis_flux_ker over (LAP,LAP,LAP)..(nx_lap,ny_lap,nz_lap) with the
+// Akx/Aky/Akz metrics as the direction-specific triple of the parameter
+// pack; results go to pEv1_d..pEv4_d.
+void du_viscous_Jacobian3d_x_init(hipStream_t *stream){
+
+	// Fixed 8 x 4 x 4 thread block for this kernel.
+	uint32_t tile_x = 8;
+	uint32_t tile_y = 4;
+	uint32_t tile_z = 4;
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, tile_x, tile_y, tile_z, nx, ny, nz);
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// Positional initialiser: the order must match the vis_flux declaration.
+	vis_flux vf = {
+		*puk_d,*pvk_d,*pwk_d,*pui_d,*pvi_d,*pwi_d,*pus_d,*pvs_d,*pws_d,
+		*pTk_d,*pTi_d,*pTs_d,*pAmu_d,
+		*pu_d,*pv_d,*pw_d,
+		*pAkx_d,*pAky_d,*pAkz_d,	// direction-specific metric triple
+		*pAjac_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d};
+
+	CUDA_LAUNCH(( hipLaunchKernelGGL(vis_flux_ker, dim3(grid), dim3(block), 0, *stream, vf, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
+
+}
+
+// Applies OCFD_dx0 to pEv1_d..pEv4_d (results in vis_u_d, vis_v_d, vis_w_d,
+// vis_T_d) and then launches YF_Pe_XF once per result against components
+// 1..4 of pdu_d (component 0 is skipped), over job_in shifted by -LAP.
+// NOTE(review): YF_Pe_XF presumably accumulates its second field into the
+// first, scaled via pAjac_d -- confirm against its definition.
+void du_viscous_Jacobian3d_x_final(cudaJobPackage job_in, hipStream_t *stream){
+
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
+
+	OCFD_dx0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+
+	cudaJobPackage job(dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP) ,
+	                   dim3(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP));
+
+	// Element count of one component inside pdu_d.
+	int size_du = pdu_d->pitch*ny*nz;
+
+	// View that is moved from one component of pdu_d to the next.
+	cudaField tmp_du;
+	tmp_du.pitch = pdu_d->pitch;
+	tmp_du.ptr = pdu_d->ptr;
+
+	cudaField *vis_terms[4] = {vis_u_d, vis_v_d, vis_w_d, vis_T_d};
+	for (int c = 0; c < 4; c++){
+		tmp_du.ptr += size_du;
+		CUDA_LAUNCH(( hipLaunchKernelGGL(YF_Pe_XF, dim3(grid), dim3(block), 0, *stream, tmp_du, *vis_terms[c], *pAjac_d, job) ));
+	}
+}
+
+// Launches vis_flux_ker over (LAP,LAP,LAP)..(nx_lap,ny_lap,nz_lap) with the
+// Aix/Aiy/Aiz metrics as the direction-specific triple of the parameter
+// pack; results go to pEv1_d..pEv4_d.
+void du_viscous_Jacobian3d_y_init(hipStream_t *stream){
+
+	// Fixed 8 x 4 x 4 thread block for this kernel.
+	uint32_t tile_x = 8;
+	uint32_t tile_y = 4;
+	uint32_t tile_z = 4;
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, tile_x, tile_y, tile_z, nx, ny, nz);
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// Positional initialiser: the order must match the vis_flux declaration.
+	vis_flux vf = {
+		*puk_d,*pvk_d,*pwk_d,*pui_d,*pvi_d,*pwi_d,*pus_d,*pvs_d,*pws_d,
+		*pTk_d,*pTi_d,*pTs_d,*pAmu_d,
+		*pu_d,*pv_d,*pw_d,
+		*pAix_d,*pAiy_d,*pAiz_d,	// direction-specific metric triple
+		*pAjac_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d};
+
+	CUDA_LAUNCH(( hipLaunchKernelGGL(vis_flux_ker, dim3(grid), dim3(block), 0, *stream, vf, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
+
+}
+
+// Applies OCFD_dy0 to pEv1_d..pEv4_d (results in vis_u_d, vis_v_d, vis_w_d,
+// vis_T_d) and then launches YF_Pe_XF once per result against components
+// 1..4 of pdu_d (component 0 is skipped), over job_in shifted by -LAP.
+// NOTE(review): YF_Pe_XF presumably accumulates its second field into the
+// first, scaled via pAjac_d -- confirm against its definition.
+void du_viscous_Jacobian3d_y_final(cudaJobPackage job_in, hipStream_t *stream){
+
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
+
+	OCFD_dy0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+
+	cudaJobPackage job(dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP) ,
+	                   dim3(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP));
+
+	// Element count of one component inside pdu_d.
+	int size_du = pdu_d->pitch*ny*nz;
+
+	// View that is moved from one component of pdu_d to the next.
+	cudaField tmp_du;
+	tmp_du.pitch = pdu_d->pitch;
+	tmp_du.ptr = pdu_d->ptr;
+
+	cudaField *vis_terms[4] = {vis_u_d, vis_v_d, vis_w_d, vis_T_d};
+	for (int c = 0; c < 4; c++){
+		tmp_du.ptr += size_du;
+		CUDA_LAUNCH(( hipLaunchKernelGGL(YF_Pe_XF, dim3(grid), dim3(block), 0, *stream, tmp_du, *vis_terms[c], *pAjac_d, job) ));
+	}
+}
+
+
+// Launches vis_flux_ker over (LAP,LAP,LAP)..(nx_lap,ny_lap,nz_lap) with the
+// Asx/Asy/Asz metrics as the direction-specific triple of the parameter
+// pack; results go to pEv1_d..pEv4_d.
+void du_viscous_Jacobian3d_z_init(hipStream_t *stream){
+
+	// Fixed 8 x 4 x 4 thread block for this kernel.
+	uint32_t tile_x = 8;
+	uint32_t tile_y = 4;
+	uint32_t tile_z = 4;
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, tile_x, tile_y, tile_z, nx, ny, nz);
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// Positional initialiser: the order must match the vis_flux declaration.
+	vis_flux vf = {
+		*puk_d,*pvk_d,*pwk_d,*pui_d,*pvi_d,*pwi_d,*pus_d,*pvs_d,*pws_d,
+		*pTk_d,*pTi_d,*pTs_d,*pAmu_d,
+		*pu_d,*pv_d,*pw_d,
+		*pAsx_d,*pAsy_d,*pAsz_d,	// direction-specific metric triple
+		*pAjac_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d};
+
+	CUDA_LAUNCH(( hipLaunchKernelGGL(vis_flux_ker, dim3(grid), dim3(block), 0, *stream, vf, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
+
+}
+
+
+// Applies OCFD_dz0 to pEv1_d..pEv4_d (results in vis_u_d, vis_v_d, vis_w_d,
+// vis_T_d) and then launches YF_Pe_XF once per result against components
+// 1..4 of pdu_d (component 0 is skipped), over job_in shifted by -LAP.
+// NOTE(review): YF_Pe_XF presumably accumulates its second field into the
+// first, scaled via pAjac_d -- confirm against its definition.
+void du_viscous_Jacobian3d_z_final(cudaJobPackage job_in, hipStream_t *stream){
+
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
+
+	OCFD_dz0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+
+	cudaJobPackage job(dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP) ,
+	                   dim3(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP));
+
+	// Element count of one component inside pdu_d.
+	int size_du = pdu_d->pitch*ny*nz;
+
+	// View that is moved from one component of pdu_d to the next.
+	cudaField tmp_du;
+	tmp_du.pitch = pdu_d->pitch;
+	tmp_du.ptr = pdu_d->ptr;
+
+	cudaField *vis_terms[4] = {vis_u_d, vis_v_d, vis_w_d, vis_T_d};
+	for (int c = 0; c < 4; c++){
+		tmp_du.ptr += size_du;
+		CUDA_LAUNCH(( hipLaunchKernelGGL(YF_Pe_XF, dim3(grid), dim3(block), 0, *stream, tmp_du, *vis_terms[c], *pAjac_d, job) ));
+	}
+}
+
+// Kernel: fills the bottom halo rows of Ev1..Ev4 by mirroring in y.
+// Each thread handles one (x, y, z) index, computed from its 3D block/thread
+// position plus job.start; indices address the fields through get_Field_LAP.
+// For a target row y the source row is 2*LAP - y of the same field.
+// Ev2 is copied with its sign kept; Ev1, Ev3 and Ev4 are copied negated.
+//   Ev1..Ev4 : fields updated in place.
+//   job      : index range; threads at or beyond job.end in any direction
+//              do nothing.
+__global__ void boundary_symmetry_pole_vis_y_ker_m(
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+	cudaJobPackage job){
+
+	// Indices of the halo cell written by this thread.
+	unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads that fall outside the requested range have no work.
+	if( ix>=job.end.x || iy>=job.end.y || iz>=job.end.z) return;
+
+	// Row reflected about index LAP.
+	unsigned int iy_src = 2*LAP - iy;
+
+	get_Field_LAP(Ev1 , ix,iy,iz) = - get_Field_LAP(Ev1 , ix,iy_src,iz);
+	get_Field_LAP(Ev2 , ix,iy,iz) =   get_Field_LAP(Ev2 , ix,iy_src,iz);
+	get_Field_LAP(Ev3 , ix,iy,iz) = - get_Field_LAP(Ev3 , ix,iy_src,iz);
+	get_Field_LAP(Ev4 , ix,iy,iz) = - get_Field_LAP(Ev4 , ix,iy_src,iz);
+}
+
+// Kernel: fills the top halo rows of Ev1..Ev4 by mirroring in y.
+// Each thread handles one (x, y, z) index, computed from its 3D block/thread
+// position plus job.start; indices address the fields through get_Field_LAP.
+// For a target row y the source row is 2*(ny_d+LAP-1) - y of the same field.
+// Ev2 is copied with its sign kept; Ev1, Ev3 and Ev4 are copied negated.
+//   Ev1..Ev4 : fields updated in place.
+//   job      : index range; threads at or beyond job.end in any direction
+//              do nothing.
+__global__ void boundary_symmetry_pole_vis_y_ker_p(
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+	cudaJobPackage job){
+
+	// Indices of the halo cell written by this thread.
+	unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads that fall outside the requested range have no work.
+	if( ix>=job.end.x || iy>=job.end.y || iz>=job.end.z) return;
+
+	// Row reflected about index ny_d+LAP-1.
+	unsigned int iy_src = 2*(ny_d+LAP-1) - iy;
+
+	get_Field_LAP(Ev1 , ix,iy,iz) = - get_Field_LAP(Ev1 , ix,iy_src,iz);
+	get_Field_LAP(Ev2 , ix,iy,iz) =   get_Field_LAP(Ev2 , ix,iy_src,iz);
+	get_Field_LAP(Ev3 , ix,iy,iz) = - get_Field_LAP(Ev3 , ix,iy_src,iz);
+	get_Field_LAP(Ev4 , ix,iy,iz) = - get_Field_LAP(Ev4 , ix,iy_src,iz);
+}
+
+// Symmetry / pole boundary condition in y for the viscous-term fields
+// *pEv1_d..*pEv4_d.
+// Does nothing unless IF_SYMMETRY == 1. Otherwise:
+//   - when npy == 0, launches boundary_symmetry_pole_vis_y_ker_m over
+//     y in [0, LAP);
+//   - when npy == NPY0-1, launches boundary_symmetry_pole_vis_y_ker_p over
+//     y in [ny_lap, ny_2lap).
+// Both launches cover x in [LAP, nx_lap) and z in [LAP, nz_lap). Both
+// conditions are tested independently, so both kernels may be launched.
+// (npy / NPY0 are presumably this rank's y position and the number of ranks
+// in y -- confirm against their declarations.)
+//   stream : stream the kernels are launched on; not synchronised here.
+void boundary_symmetry_pole_vis_y(hipStream_t *stream){
+	// Boundary treatment is only active for the symmetry option.
+	if(IF_SYMMETRY != 1) return;
+
+	dim3 griddim , blockdim;
+
+	if(npy == 0){
+		// Bottom halo rows.
+		cal_grid_block_dim(&griddim , &blockdim , BlockDimX , LAP , BlockDimZ , nx , LAP , nz);
+		cudaJobPackage bottom_halo(dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap));
+		CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_symmetry_pole_vis_y_ker_m, dim3(griddim ), dim3(blockdim), 0, *stream, *pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d , bottom_halo) ));
+	}
+
+	if(npy == NPY0-1){
+		// Top halo rows.
+		cal_grid_block_dim(&griddim , &blockdim , BlockDimX , LAP , BlockDimZ , nx , LAP , nz);
+		cudaJobPackage top_halo(dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap));
+		CUDA_LAUNCH(( hipLaunchKernelGGL(boundary_symmetry_pole_vis_y_ker_p, dim3(griddim ), dim3(blockdim), 0, *stream, *pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d , top_halo) ));
+	}
+}
+
+#ifdef __cplusplus
+}
+#endif