first commit

c0b0318b · ccfd · c0b0318b · c0b0318b · c0b0318b · c0b0318b
Commit c0b0318b authored Jul 21, 2022 by ccfd
20 changed files
--- a/head_hip/test.h
+++ b/head_hip/test.h
+#ifndef __TEST_H
+#define __TEST_H
+// Prototypes of the write_block_me* helpers; their definitions are not part of this header.
+#include <stdlib.h>
+#include <stdio.h>
+#include "parameters.h"
+void write_block_me(char * name , REAL * u , int nx , int ny , int nz); // presumably writes the nx*ny*nz block u to an output called name -- TODO confirm against the definition
+void write_block_me1(char * name , REAL * u , int nx , int ny , int nz); // same signature as write_block_me; how the variants differ is not visible here
+void write_block_me2(char * name , REAL * u , int nx , int ny , int nz); // same signature as write_block_me
+void write_block_me3(char * name , REAL * u , int nx , int ny , int nz); // same signature as write_block_me
+#endif
--- a/head_hip/utility.h
+++ b/head_hip/utility.h
+#ifndef __UTILITY_H
+#define __UTILITY_H
+#include <stdlib.h>
+#include <stdio.h>
+#include "parameters.h"
+#include "config_parameters.h"
+
+// Reinterpret a flat REAL buffer as a pointer to [ny][nx] planes, so it can be indexed a[k][j][i].
+// Arguments and the whole expansion are parenthesized so the macros compose safely with
+// surrounding operators (e.g. PTR2ARRAY2(p,nx,ny)[k] or MAX(a & 1, b)).
+#define PTR2ARRAY2(ptr,nx,ny) ((REAL(*)[(ny)][(nx)])(ptr))
+// Same idea for a pointer to [nz][ny][nx] volumes.
+#define PTR2ARRAY3(ptr,nx,ny,nz) ((REAL(*)[(nz)][(ny)][(nx)])(ptr))
+// NOTE: MAX/MIN evaluate their arguments twice -- do not pass expressions with side effects.
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+// Linear process number from 3-D process indices; proc_i varies fastest, then proc_j, then proc_k.
+#define PROCIdx2Num(proc_i , proc_j , proc_k) ((proc_i) + (proc_j)*NPX0 + (proc_k)*NPX0*NPY0)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define malloc_me_Host(p, size) malloc_me_Host_((void**)&p , size ,  __FUNCTION__ , __FILE__ ,__LINE__) // passes the address of p (cast to void**) and size, tagged with the caller's function, file and line
+void malloc_me_Host_(void **p, int size , const char * funname , const char * file , int line); // defined elsewhere; NOTE(review): size is an int, presumably a byte count -- confirm the 2 GiB ceiling is acceptable
+// The second form takes only a size and yields the helper's void* result.
+#define malloc_me(size) malloc_me_(size , __FUNCTION__ , __FILE__ ,__LINE__) // same call-site tagging as malloc_me_Host
+void * malloc_me_(int size , const char * funname , const char * file , int line); // defined elsewhere
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
--- a/makefile
+++ b/makefile
+#nonw
+
+HOST_NAME=$(shell hostname)
+
+SRC=$(wildcard src/*.c)
+SRC+=$(wildcard src/*.cu)
+
+HEAD=$(wildcard head/*.h)
+
+OBJ=$(patsubst src/%c,obj/%o, $(SRC) )
+OBJ:=$(patsubst src/%cu,obj/%o, $(OBJ) )
+
+
+# These targets name actions, not files; declaring them phony keeps them working
+# even if a file called "clean", "zip", "echo", ... appears in the directory.
+.PHONY: all default clean zip echo hip_file
+all: default
+ifneq ($(shell which hipcc),)
+# HIP compiler
+#ifndef MPICH
+#$(error env MPICH doesn't exist , MPI_PATH has wrong value)
+#endif
+#ifndef HIPCC
+#$(error env HIP doesn't exist , DEV_PATH has wrong value)
+#endif
+
+#MPI_PATH=/opt/hpc/software/mpi/hpcx/v2.4.1/
+MPI_PATH=/opt/hpc/software/mpi/hpcx/v2.7.4/gcc-7.3.1/
+DEV_PATH=/opt/rocm/hip/
+
+
+DEV=hipcc
+HOST=mpicxx
+
+OPT_Commen=-O3
+
+OPT_Host=-c -std=c99 -I $(DEV_PATH)/include -I $(DEV_PATH)/include/hip/hcc_detail/cuda -D __HIP_PLATFORM_HCC__ -D __HIPCC__
+OPT_Host+= $(OPT_Commen)
+
+OPT_Dev=-c -I /usr/include/x86_64-linux-gnu/mpich
+OPT_Dev+=$(OPT_Commen)
+
+SRC:=$(patsubst src/%.c,src_hip/%.c, $(SRC))
+SRC:=$(patsubst src/%.cu,src_hip/%.cpp, $(SRC))
+
+HEAD:=$(patsubst head/%.h,head_hip/%.h, $(HEAD))
+
+.PRECIOUS : %.o %_hip.c %_hip.cpp %_hip.h
+
+opencfd_hip.c : opencfd.c 
+	hipify-perl $< > $@
+
+opencfd.o : opencfd_hip.c $(HEAD)
+	$(HOST) $(OPT_Host) -I head_hip/ -o opencfd.o opencfd_hip.c
+
+hip_file : $(HEAD) $(SRC)
+
+# Translate CUDA headers/sources to HIP with hipify-perl, then compile them.
+# "mkdir -p" replaces the test-then-mkdir sequence: it succeeds when the
+# directory already exists, so parallel jobs (make -j) cannot race on it.
+head_hip/%.h : head/%.h
+	@mkdir -p head_hip
+	hipify-perl $< > $@
+
+src_hip/%.c : src/%.c 
+	@mkdir -p src_hip
+	hipify-perl $< > $@
+
+src_hip/%.cpp : src/%.cu
+	@mkdir -p src_hip
+	hipify-perl $< > $@
+
+src_hip/%.cpp : ana/%.c
+	@mkdir -p src_hip
+	hipify-perl $< > $@
+
+src_hip/%.cpp : ana/%.cu
+	@mkdir -p src_hip
+	hipify-perl $< > $@
+
+# Host (C) objects are built with $(HOST), device (C++) objects with $(DEV).
+obj/%.o : src_hip/%.c head_hip/%.h
+	@mkdir -p obj
+	$(HOST) $(OPT_Host) -I head_hip/ $< -o $@
+
+obj/%.o : src_hip/%.cpp head_hip/%.h
+	@mkdir -p obj
+	$(DEV) $(OPT_Dev) -I head_hip/ $< -o $@
+
+clean:
+	rm -f *.o *.out obj/*.o obj/*.a src_hip/*  head_hip/* opencfd_hip.c
+
+else
+#nvcc compiler
+
+MPI_PATH=/usr/
+
+#MPI_PATH=/home/dglin/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/
+DEV_PATH=/usr/local/cuda-11.6/
+
+#ifndef MPICH
+#$(error env MPICH doesn't exist , MPI_PATH has wrong value)
+#endif
+#ifndef CUDA
+#$(error env CUDA doesn't exist , DEV_PATH has wrong value)
+#endif
+#
+#
+#MPI_PATH=$(MPICH)
+#DEV_PATH=$(CUDA)
+
+DEV=nvcc
+HOST=$(MPI_PATH)/bin/mpicc
+
+OPT_Commen=-g
+
+OPT_Host=-c -std=c99 -I $(DEV_PATH)/include
+OPT_Host+= $(OPT_Commen)
+
+OPT_Dev=-dc -I /usr/include/x86_64-linux-gnu/mpich
+OPT_Dev+=$(OPT_Commen) -G -code=sm_75 -arch=compute_75
+
+
+
+
+opencfd.o : opencfd.c
+	$(HOST) $(OPT_Host) -I head/ -o opencfd.o opencfd.c
+
+# Host (C) objects are built with $(HOST), CUDA objects with $(DEV).
+# "mkdir -p" is safe when obj/ already exists, so parallel jobs cannot race on it.
+obj/%.o : src/%.c head/%.h
+	@mkdir -p obj
+	$(HOST) $(OPT_Host) -I head/ $< -o $@
+
+obj/%.o : src/%.cu head/%.h
+	@mkdir -p obj
+	$(DEV) $(OPT_Dev) -I head/ $< -o $@
+
+
+clean:
+	rm -f *.o *.out obj/*.o obj/*.a
+
+endif
+
+
+default : opencfd.o obj/libocfd.a
+	$(DEV) -O3 -o opencfd-scu.out opencfd.o -L obj -locfd -L $(MPI_PATH)/lib -lmpi -lm -lpthread
+
+
+obj/libocfd.a : $(OBJ)
+	ar -crv obj/libocfd.a $(OBJ)
+
+
+ZIP_EXIST=0
+zip : 
+	@if [ -e "src_cuda.zip" ] ; then rm src_cuda.zip ; echo "rm src_cuda.zip"; fi
+	zip --quiet -r src_cuda.zip head/ src/ test/ opencfd.c makefile README
+
+echo:
+	@echo $(HOST_NAME)
+	@echo $(SRC)
+	@echo $(OBJ)
+	@echo $(HEAD)
+	@echo $(A)
--- a/opencfd-scu.in
+++ b/opencfd-scu.in
+#OpenCFD-SCU-Ver2.00 input file， Dglin， 2021-05 
+GRID_3D = 25   240   20
+PARALLEL_3D = 1   1   1
+STREAM = 1
+CHARTERIC = 1
+TEST = 0
+
+IPERIODIC = 0   0   1
+JAC_BOUND = 1   1   1
+DIF_BOUND = 1   1   1   1   0   0
+NON_REFLETION = 0   1   0   1   0   0
+
+SCHEME_INVIS = WENO7_SYMBO
+SCHEME_VIS = CD8
+#SCHEME_INVIS = SCHEME_HYBRIDAUTO
+#HY_DP_INTV = 1.0 8.0
+#HY_STYLE = 1
+#HY_SMOOTH_DP = 1
+#HY_PATCH_ZONE = 1
+#HY_ZONE0 = 10   25   100   240   5   20   20.0
+
+
+RE = 5581.4
+AMA = 2.9
+GAMMA = 1.40
+PR = 0.70
+T_REF = 108.1
+EPSL_SW = 0.0
+
+DT = 0.01
+END_TIME = 2000
+KSTEP_SHOW = 1
+KSTEP_SAVE = 10000
+INIT_STAT = 1
+
+IBC = 108
+#mzmax, mtmax, Inlet_boundary, If_wall_not_normal
+BC_NPARA = 10   5   1   0
+
+#Tw, epsl, x_dis_begin, x_dis_end, beta, x_wall_begin, x_up_bound_begin, SLZ
+BC_RPARA = 2.84   0.2   -320.   -300   0.1   -400.   -50.   14.  
+
+
+#nstep_filter, Filter_X, Filter_Y, Filter_Z, ib, ie, jb, je, kb, ke, Filter_scheme  // s0, rth, Filter_end_time
+#FILTER_NPARA0 = 100   1   1   1   0   25   0   240   0   20   2
+#FILTER_RPARA0 = 1.0   1.e-5   1000000
+
+
+#ANA_EVENT0 = 100   10
+#ANA_NPARA0 = 0
+#ANA_RPARA0 = 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#*****************************************************************************************************
+# IBC  nk   nr  IF_symmetry  IF_withleading   Iflag_upperboundary    AoA  Tw     epsl_wall epsl_upper  wall_dis_begin wall_dis_end  //(liftbody)
+  124   3    6        1              0                 1               0.  3.797     0.02        0.          50.           60.
+
+
--- a/opencfd.c
+++ b/opencfd.c
+//----------------------------------------------------------------------------------------------------------------------------------------   
+// OpenCFD-SC  ,  3-D compressible Navier-Stokes Finite difference Solver 
+// Copyright by LI Xinliang, LHD, Institute of Mechanics, CAS, Email: lixl@imech.ac.cn
+//  
+// The default code is double precision computation
+// If you want to use SINGLE PRECISION computation, you can change   "OCFD_REAL_KIND=8"  to "OCFD_REAL_KIND=4" ,
+// and  "OCFD_DATA_TYPE=MPI_DOUBLE_PRECISION" to "OCFD_DATA_TYPE=MPI_REAL" in the file OpenCFD.h 
+//---------------------------------------------------------------------------------------------------------------------------------------------- 
+#include <stdlib.h>
+#include <stdio.h>
+#include "mpi.h"
+
+#include "utility.h"
+#include "parameters.h"
+
+#include "OCFD_NS_Solver.h"
+#include "OCFD_mpi.h"
+#include "OCFD_init.h"
+#include "cuda_commen.h"
+#include "OCFD_mpi_dev.h"
+#include "OCFD_filtering.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+int main(int argc, char *argv[]){ // Program entry: runs the solver stages below once, in this fixed order; always returns 0.
+    mpi_init(&argc , &argv); // project function (lower-case, not MPI_Init itself); presumably starts MPI -- TODO confirm
+    // Every stage below is a project function declared in the included headers; the notes are inferred from the names only.
+    read_parameters(); // presumably parses the run parameters -- TODO confirm
+
+    opencfd_mem_init_mpi();  
+
+    part(); // presumably sets up the domain partition -- TODO confirm
+
+    set_para_filtering();
+    
+    opencfd_mem_init_all();
+
+    cuda_commen_init();
+
+    init();
+
+    NS_solver_real(); // presumably the time-advancing solver itself -- TODO confirm
+
+    opencfd_mem_finalize_all();
+
+    mpi_finalize();
+    
+    return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/opencfd.log
+++ b/opencfd.log
+##################################################################################################
+OpenCFD-SCU-V1.00 CopyRight by Li-Xinliang, LHD, Institute of Mechanics, CAS (lixl@imech.ac.cn)
+Coded by Liu-Shiwei, ICMSEC, Academy of Mathematics and Systems Science, CAS (liusw@lsec.cc.ac.cn)
+Coded by Dang-Guanlin, LHD, Institute of Mechanics, CAS (dangguanlin@imech.ac.cn) 2020-01
+
+Mesh(Nx,Ny,Nz): (250,240,90)
+3D Partation: 1*1*1   Total procs=1
+Re=1.000000 , Ma=8.000000 , Gamma=1.000000 , dt=1950.000000
+Start Computing ......
+##################################################################################################
+OpenCFD-SCU-V1.00 CopyRight by Li-Xinliang, LHD, Institute of Mechanics, CAS (lixl@imech.ac.cn)
+Coded by Liu-Shiwei, ICMSEC, Academy of Mathematics and Systems Science, CAS (liusw@lsec.cc.ac.cn)
+Coded by Dang-Guanlin, LHD, Institute of Mechanics, CAS (dangguanlin@imech.ac.cn) 2020-01
+Mesh(Nx,Ny,Nz): (25,240,20)
+3D Partation: 1*1*1   Total procs=1
+Re=5581.400000 , Ma=2.900000 , Gamma=1.400000 , dt=0.010000
+Start Computing ......
--- a/opencfd_hip.c
+++ b/opencfd_hip.c
+//----------------------------------------------------------------------------------------------------------------------------------------   
+// OpenCFD-SC  ,  3-D compressible Navier-Stokes Finite difference Solver 
+// Copyright by LI Xinliang, LHD, Institute of Mechanics, CAS, Email: lixl@imech.ac.cn
+//  
+// The default code is double precision computation
+// If you want to use SINGLE PRECISION computation, you can change   "OCFD_REAL_KIND=8"  to "OCFD_REAL_KIND=4" ,
+// and  "OCFD_DATA_TYPE=MPI_DOUBLE_PRECISION" to "OCFD_DATA_TYPE=MPI_REAL" in the file OpenCFD.h 
+//---------------------------------------------------------------------------------------------------------------------------------------------- 
+#include <stdlib.h>
+#include <stdio.h>
+#include "mpi.h"
+
+#include "utility.h"
+#include "parameters.h"
+
+#include "OCFD_NS_Solver.h"
+#include "OCFD_mpi.h"
+#include "OCFD_init.h"
+#include "cuda_commen.h"
+#include "OCFD_mpi_dev.h"
+#include "OCFD_filtering.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+int main(int argc, char *argv[]){ // Program entry (same call sequence as main in opencfd.c): runs the solver stages below once, in this fixed order; always returns 0.
+    mpi_init(&argc , &argv); // project function (lower-case, not MPI_Init itself); presumably starts MPI -- TODO confirm
+    // Every stage below is a project function declared in the included headers; the notes are inferred from the names only.
+    read_parameters(); // presumably parses the run parameters -- TODO confirm
+
+    opencfd_mem_init_mpi();  
+
+    part(); // presumably sets up the domain partition -- TODO confirm
+
+    set_para_filtering();
+    
+    opencfd_mem_init_all();
+
+    cuda_commen_init();
+
+    init();
+
+    NS_solver_real(); // presumably the time-advancing solver itself -- TODO confirm
+
+    opencfd_mem_finalize_all();
+
+    mpi_finalize();
+    
+    return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/src/OCFD_Comput_Jacobian3d.cu
+++ b/src/OCFD_Comput_Jacobian3d.cu
+/*--------- This code runs only at the initial times -----------------------
+Read the computational grid (Axx, Ayy, Azz) and compute the Jacobian coefficients.
+This routine runs only during the initialization stage.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_Comput_Jacobian3d.h"
+#include "OCFD_Schemes_Choose.h"
+#include "OCFD_mpi.h"
+#include "OCFD_IO.h"
+
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "OCFD_mpi_dev.h"
+#include "commen_kernel.h"
+#include "math.h"
+#include "OCFD_ana.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+/* Read `count` fields from `file` into the host arrays listed in `fields`.
+ * Field n is read with read_3d1 at byte offset n*stride. */
+static void jac3d_read_fields(MPI_File file, REAL *const *fields, int count, MPI_Offset stride)
+{
+    MPI_Offset offset = 0;
+    for(int n = 0; n < count; n++){
+        read_3d1(file, offset, fields[n]);
+        offset += stride;
+    }
+}
+
+/* Open `path` read-only on MPI_COMM_WORLD and return the handle.
+ * MPI-IO reports failures through the return code by default, so the code is
+ * checked here and the run is aborted instead of reading through an invalid handle. */
+static MPI_File jac3d_open_or_abort(const char *path)
+{
+    MPI_File file;
+    int rc = MPI_File_open(MPI_COMM_WORLD, (char *)path, MPI_MODE_RDONLY, MPI_INFO_NULL, &file);
+    if(rc != MPI_SUCCESS){
+        fprintf(stderr, "Init_Jacobian3d: cannot open %s (MPI error code %d)\n", path, rc);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    return file;
+}
+
+/* Initialise the grid coordinates (Axx, Ayy, Azz) and the Jacobian coefficients.
+ *
+ * All 13 fields are first set to 1.0 on the device and on the host. Then:
+ *   - Init_stat == 0            : the host coordinates are generated analytically;
+ *   - OCFD3d-Jacobi.dat missing : the coordinates are read from OCFD3d-Mesh.dat,
+ *                                 uploaded, and Comput_Jacobian3d() is called;
+ *   - OCFD3d-Jacobi.dat present : all 13 fields are read from it and uploaded.
+ * ana_Jac() is called at the end in every case.
+ */
+void Init_Jacobian3d()
+{
+    /* Host arrays in the order in which they are read from OCFD3d-Jacobi.dat;
+     * the first three are also the fields read from OCFD3d-Mesh.dat. */
+    REAL *host[13] = { pAxx, pAyy, pAzz, pAkx, pAky, pAkz, pAix, pAiy, pAiz, pAsx, pAsy, pAsz, pAjac };
+    const int n_mesh = 3;
+    const int n_all = 13;
+    const char *jacobi_file = "OCFD3d-Jacobi.dat";
+    /* Distance in bytes between two consecutive fields in either file. */
+    const MPI_Offset field_stride = (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
+
+    //    init with unit (device side)
+    cuda_mem_value_init_warp(1.0 , pAjac_d->ptr , pAjac_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAxx_d->ptr , pAxx_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAyy_d->ptr , pAyy_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAzz_d->ptr , pAzz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAkx_d->ptr , pAkx_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAky_d->ptr , pAky_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAkz_d->ptr , pAkz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAix_d->ptr , pAix_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAiy_d->ptr , pAiy_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAiz_d->ptr , pAiz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAsx_d->ptr , pAsx_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAsy_d->ptr , pAsy_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+    cuda_mem_value_init_warp(1.0 , pAsz_d->ptr , pAsz_d->pitch , nx_2lap , ny_2lap , nz_2lap);
+
+    //    init with unit (host side, whole padded block)
+    {
+        int tmp_size = (nx+2*LAP)*(ny+2*LAP)*(nz+2*LAP);
+        for(int n = 0; n < n_all; n++){
+            REAL * tmp = host[n];
+            for(int i=0;i<tmp_size;i++) (*tmp++) = 1.0;
+        }
+    }
+    // -------------------------------------------------------------------------
+
+    if(Init_stat == 0){
+        /* Analytic grid: x = i_real*hx, (y,z) = r*(cos(theta), sin(theta)), with theta
+         * starting at -PI/2 and r starting at 1.0.
+         * NOTE(review): this branch only fills the host arrays; nothing is uploaded to
+         * the device here -- confirm that is intended. */
+        int i,j,k;
+        int klap , jlap,ilap;
+        int i_off , j_off , k_off;
+        int i_real, j_real, k_real;
+        REAL r , d_r;
+        REAL theta , d_theta , theta_0;
+
+        REAL r0 = 1.0;
+        REAL dr = 1.0;
+        d_theta = PI / NY_GLOBAL;
+        theta_0 = -PI*0.5;
+        d_r = dr / NZ_GLOBAL;
+
+        i_off = i_offset[npx];
+        j_off = j_offset[npy];
+        k_off = k_offset[npz];
+        for(k = 0;k<nz;k++){
+            klap = k+LAP;
+            k_real = k + k_off;
+            r = r0 + d_r * k_real;
+            for(j=0;j<ny;j++){
+                jlap = j+LAP;
+                j_real = j + j_off;
+                theta = theta_0 + d_theta * j_real;
+                for(i=0;i<nx;i++){
+                    ilap = i+LAP;
+                    i_real = i + i_off;
+                    pAxx[ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap] = i_real * hx;
+                    pAyy[ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap] = r * cos(theta);
+                    pAzz[ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap] = r * sin(theta);
+                }
+            }
+        }
+        if(npy == NPY0 - 1){
+            /* On the last block in y, overwrite the last j row with y = 0, z = r. */
+            jlap = ny - 1 + LAP;
+            for(k = 0; k<nz ; k++){
+                klap = k+LAP;
+                k_real = k + k_off;
+                r = r0 + d_r * k_real;
+                for(i=0;i<nx;i++){
+                    ilap = i+LAP;
+                    pAyy[ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap] = 0.0;
+                    pAzz[ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap] = r;
+                }
+            }
+        }
+    }else if(access(jacobi_file, F_OK) == -1){
+        /* OCFD3d-Jacobi.dat does not exist: read the mesh and derive the coefficients.
+         * NOTE(review): access() is evaluated by every rank independently; if ranks can
+         * see different file systems they may take different branches -- confirm. */
+        if(my_id == 0) printf("read 3D mesh data: OCFD3d-Mesh.dat ...\n");
+
+        MPI_File mesh_file = jac3d_open_or_abort("OCFD3d-Mesh.dat");
+        jac3d_read_fields(mesh_file, host, n_mesh, field_stride);
+        MPI_File_close(&mesh_file);
+
+        for(int n = 0; n < n_mesh; n++) exchange_boundary_xyz(host[n]);
+
+        memcpy_All(pAxx , pAxx_d->ptr , pAxx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAyy , pAyy_d->ptr , pAyy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAzz , pAzz_d->ptr , pAzz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+
+        Comput_Jacobian3d();
+    }else{
+        /* OCFD3d-Jacobi.dat exists: load the coordinates and all coefficients from it. */
+        if(my_id == 0) printf("OCFD3d-Jacobi.dat is exit\nread 3D Jacobi data ...... ");
+
+        MPI_File jac_file = jac3d_open_or_abort(jacobi_file);
+        jac3d_read_fields(jac_file, host, n_all, field_stride);
+        MPI_File_close(&jac_file);
+
+        for(int n = 0; n < n_all; n++) exchange_boundary_xyz(host[n]);
+
+        memcpy_All(pAxx , pAxx_d->ptr , pAxx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAyy , pAyy_d->ptr , pAyy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAzz , pAzz_d->ptr , pAzz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAkx , pAkx_d->ptr , pAkx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAky , pAky_d->ptr , pAky_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAkz , pAkz_d->ptr , pAkz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAix , pAix_d->ptr , pAix_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAiy , pAiy_d->ptr , pAiy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAiz , pAiz_d->ptr , pAiz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAsx , pAsx_d->ptr , pAsx_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAsy , pAsy_d->ptr , pAsy_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAsz , pAsz_d->ptr , pAsz_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+        memcpy_All(pAjac , pAjac_d->ptr , pAjac_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+    }
+
+    ana_Jac();
+}
+
+
+void Comput_Jacobian3d(){
+
+    boundary_Jac3d_Axx(); //only using the  boudary condition for Axx, Ayy, Azz
+
+    if (my_id == 0)
+        printf("Comput Jacobian 3D data ...\n");
+    
+    comput_Jac3d();
+    if (my_id == 0)
+        printf("Comput Jacobian 3D data OK\n");
+
+    // ---------------
+    exchange_boundary_xyz_packed_dev(pAkx  , pAkx_d);
+    exchange_boundary_xyz_packed_dev(pAky  , pAky_d);
+    exchange_boundary_xyz_packed_dev(pAkz  , pAkz_d);
+    exchange_boundary_xyz_packed_dev(pAix  , pAix_d);
+    exchange_boundary_xyz_packed_dev(pAiy  , pAiy_d);
+    exchange_boundary_xyz_packed_dev(pAiz  , pAiz_d);
+    exchange_boundary_xyz_packed_dev(pAsx  , pAsx_d);
+    exchange_boundary_xyz_packed_dev(pAsy  , pAsy_d);
+    exchange_boundary_xyz_packed_dev(pAsz  , pAsz_d);
+    exchange_boundary_xyz_packed_dev(pAjac , pAjac_d);
+
+    boundary_Jac3d_Liftbody_Ajac(); //boudary condition for Axx, Ayy, Azz, Aix, Aiy, Aiz , ......
+
+}
+
+// ----------------------------------------------------------------------------
+
+__global__ void comput_Jac3d_kernal(
+    cudaField xi,
+    cudaField xj,
+    cudaField xk,
+    cudaField yi,
+    cudaField yj,
+    cudaField yk,
+    cudaField zi,
+    cudaField zj,
+    cudaField zk,
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    cudaJobPackage job
+){
+    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+	
+	if(x < job.end.x && y < job.end.y && z < job.end.z){
+        REAL xi1, xj1, xk1, yi1, yj1, yk1, zi1, zj1, zk1, Jac1;
+        xi1 = get_Field(xi, x, y, z);
+        xj1 = get_Field(xj, x, y, z);
+        xk1 = get_Field(xk, x, y, z);
+        yi1 = get_Field(yi, x, y, z);
+        yj1 = get_Field(yj, x, y, z);
+        yk1 = get_Field(yk, x, y, z);
+        zi1 = get_Field(zi, x, y, z);
+        zj1 = get_Field(zj, x, y, z);
+        zk1 = get_Field(zk, x, y, z);
+        Jac1 = 1.0 / (xi1 * yj1 * zk1 + yi1 * zj1 * xk1 + zi1 * xj1 * yk1 - zi1 * yj1 * xk1 - yi1 * xj1 * zk1 - xi1 * zj1 * yk1); //1./Jocabian = d(x,y,z)/d(i,j,k)
+        get_Field_LAP(Ajac , x+LAP , y+LAP , z+LAP) = Jac1;
+        get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP) = (yj1 * zk1 - zj1 * yk1);
+        get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP) = (zj1 * xk1 - xj1 * zk1);
+        get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP) = (xj1 * yk1 - yj1 * xk1);
+        get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP) = (yk1 * zi1 - zk1 * yi1);
+        get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP) = (zk1 * xi1 - xk1 * zi1);
+        get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP) = (xk1 * yi1 - yk1 * xi1);
+        get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP) = (yi1 * zj1 - zi1 * yj1);
+        get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP) = (zi1 * xj1 - xi1 * zj1);
+        get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP) = (xi1 * yj1 - yi1 * xj1);
+
+
+        if(x == 0){
+            for(int i = 0; i < LAP; i++){
+                get_Field_LAP(Ajac, i, y+LAP, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx, i, y+LAP, z+LAP) =  get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky, i, y+LAP, z+LAP) =  get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz, i, y+LAP, z+LAP) =  get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix, i, y+LAP, z+LAP) =  get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy, i, y+LAP, z+LAP) =  get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz, i, y+LAP, z+LAP) =  get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx, i, y+LAP, z+LAP) =  get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy, i, y+LAP, z+LAP) =  get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz, i, y+LAP, z+LAP) =  get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        if(x == job.end.x-1){
+            for(int i = 1; i <= LAP; i++){
+                get_Field_LAP(Ajac, x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP+i, y+LAP, z+LAP) = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        if(y == 0){
+            for(int i = 0; i < LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, i, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx, x+LAP, i, z+LAP) =  get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky, x+LAP, i, z+LAP) =  get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz, x+LAP, i, z+LAP) =  get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix, x+LAP, i, z+LAP) =  get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy, x+LAP, i, z+LAP) =  get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz, x+LAP, i, z+LAP) =  get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx, x+LAP, i, z+LAP) =  get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy, x+LAP, i, z+LAP) =  get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz, x+LAP, i, z+LAP) =  get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+
+        if(y == job.end.y-1){
+            for(int i = 1; i <= LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP, y+LAP+i, z+LAP) = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        if(z == 0){
+            for(int i = 0; i < LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, y+LAP, i) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP, y+LAP, i) =  get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP, y+LAP, i) =  get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP, y+LAP, i) =  get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP, y+LAP, i) =  get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP, y+LAP, i) =  get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP, y+LAP, i) =  get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP, y+LAP, i) =  get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP, y+LAP, i) =  get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP, y+LAP, i) =  get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+
+        if(z == job.end.z-1){
+            for(int i = 1; i <= LAP; i++){
+                get_Field_LAP(Ajac, x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Ajac, x+LAP , y+LAP , z+LAP);
+                get_Field_LAP(Akx,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aky,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Akz,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aix,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiy,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Aiz,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asx,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asy,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+                get_Field_LAP(Asz,  x+LAP, y+LAP, z+LAP+i) = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+            }
+        }
+	}
+}
+
+void comput_Jac3d() // Host driver: fills nine derivative fields via OCFD_d{x,y,z}0_jac, then launches comput_Jac3d_kernal on them.
+{ // The nine cudaField views below reuse the ptr/pitch of the pu*_d / pv*_d / pw*_d buffers.
+    cudaField xi; xi.ptr = puk_d->ptr; xi.pitch = puk_d->pitch; // note the naming: the "i" view sits on the *k_d buffer, "j" on *i_d, "k" on *s_d
+    cudaField xj; xj.ptr = pui_d->ptr; xj.pitch = pui_d->pitch;
+    cudaField xk; xk.ptr = pus_d->ptr; xk.pitch = pus_d->pitch;
+    cudaField yi; yi.ptr = pvk_d->ptr; yi.pitch = pvk_d->pitch;
+    cudaField yj; yj.ptr = pvi_d->ptr; yj.pitch = pvi_d->pitch;
+    cudaField yk; yk.ptr = pvs_d->ptr; yk.pitch = pvs_d->pitch;
+    cudaField zi; zi.ptr = pwk_d->ptr; zi.pitch = pwk_d->pitch;
+    cudaField zj; zj.ptr = pwi_d->ptr; zj.pitch = pwi_d->pitch;
+    cudaField zk; zk.ptr = pws_d->ptr; zk.pitch = pws_d->pitch;
+    // Job box for the derivative calls: [LAP, n*_lap) in each direction.
+	cudaJobPackage job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+    // Presumably each call writes the derivative of its first argument into its second -- TODO confirm against OCFD_d?0_jac.
+    OCFD_dx0_jac(*pAxx_d, xi, job, BlockDim_X, &Stream[0], Jacbound[0]);
+    OCFD_dx0_jac(*pAyy_d, yi, job, BlockDim_X, &Stream[0], Jacbound[0]);
+    OCFD_dx0_jac(*pAzz_d, zi, job, BlockDim_X, &Stream[0], Jacbound[0]);
+    OCFD_dy0_jac(*pAxx_d, xj, job, BlockDim_Y, &Stream[0], Jacbound[1]);
+    OCFD_dy0_jac(*pAyy_d, yj, job, BlockDim_Y, &Stream[0], Jacbound[1]);
+    OCFD_dy0_jac(*pAzz_d, zj, job, BlockDim_Y, &Stream[0], Jacbound[1]);
+    OCFD_dz0_jac(*pAxx_d, xk, job, BlockDim_Z, &Stream[0], Jacbound[2]);
+    OCFD_dz0_jac(*pAyy_d, yk, job, BlockDim_Z, &Stream[0], Jacbound[2]);
+    OCFD_dz0_jac(*pAzz_d, zk, job, BlockDim_Z, &Stream[0], Jacbound[2]);
+    // The kernel uses an unshifted job box [0, n) and adds LAP itself when it stores results.
+    dim3 griddim , blockdim;
+    cal_grid_block_dim(&griddim , &blockdim , BlockDimX/8 , BlockDimY , BlockDimZ , nx,ny,nz); // NOTE(review): BlockDimX here vs BlockDim_X above are different names -- confirm both are intended
+    job.setup( dim3(0,0,0) , dim3(nx,ny,nz) );
+    // NOTE(review): the launch names no stream, while the derivative calls receive &Stream[0] -- confirm the ordering between them is guaranteed.
+    CUDA_LAUNCH(( comput_Jac3d_kernal<<<griddim , blockdim>>>(xi,xj,xk,yi,yj,yk,zi,zj,zk,*pAkx_d,
+            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,job) ));
+
+}
+
+// ------------------------------------------------------------------------
+// Symmetry boundary at j=1 & j=ny_global
+
+// For every cell of the job box, overwrite pA(x, y, z) with `value` times the
+// cell mirrored in y about the index ny_lap_d-1, i.e. pA(x, 2*(ny_lap_d-1) - y, z).
+__global__ void boundary_Jac3d_kernal_y_r(cudaField pA, REAL value, cudaJobPackage job){
+    const unsigned int x = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;
+
+    // Threads outside the job box do nothing.
+    if( x >= job.end.x || y >= job.end.y || z >= job.end.z ) return;
+
+    get_Field_LAP(pA, x, y, z) =  value*get_Field_LAP(pA, x, 2*(ny_lap_d-1) - y, z);
+}
+
+/*
+ * Fill the points of field pA covered by `job` with `value` times the entry
+ * mirrored in y about index LAP, i.e. read from 2*LAP - y.
+ * One thread handles one (x, y, z) point; `value` is typically +1 or -1 to
+ * select an even or odd reflection.
+ */
+__global__ void boundary_Jac3d_kernal_y_l(cudaField pA, REAL value, cudaJobPackage job){
+    const unsigned int x = job.start.x + blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = job.start.y + blockIdx.y*blockDim.y + threadIdx.y;
+    const unsigned int z = job.start.z + blockIdx.z*blockDim.z + threadIdx.z;
+
+    // Threads that fall outside the job extent have nothing to write.
+    if( x >= job.end.x || y >= job.end.y || z >= job.end.z ) return;
+
+    get_Field_LAP(pA, x, y, z) =  value*get_Field_LAP(pA, x, 2*LAP - y, z);
+}
+
+
+/*
+ * Ghost-point fill for the ramp-wall boundary on the low-y side.
+ *
+ * For every point (x, y, z) inside `job`, the Jacobian (Ajac) and the nine
+ * metric fields (Ak*, Ai*, As*) are overwritten from their own values mirrored
+ * in y about index LAP (source index 2*LAP - y).
+ *   - Where xx(x, LAP, z) <= 0 the mirrored value is copied with a fixed
+ *     sign pattern.
+ *   - Elsewhere the same sign pattern is used, but each value is also scaled
+ *     by factors built from `seta` and the local differences dx, dy of the
+ *     xx / yy fields between mirrored rows 2*LAP-y and 2*LAP-y-1.
+ *
+ * NOTE(review): dx == 0 or dy == 0 leads to a division by zero in the scaled
+ * branch - confirm the grid rules this out.
+ */
+__global__ void boundary_Jac3d_kernal_y_ramp_wall_kernel(
+    cudaField xx, 
+    cudaField yy, 
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    REAL seta,
+    cudaJobPackage job){
+    const unsigned int x = job.start.x + blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = job.start.y + blockIdx.y*blockDim.y + threadIdx.y;
+    const unsigned int z = job.start.z + blockIdx.z*blockDim.z + threadIdx.z;
+
+    // Threads that fall outside the job extent have nothing to write.
+    if( x >= job.end.x || y >= job.end.y || z >= job.end.z ) return;
+
+    if( get_Field_LAP(xx, x, LAP, z) <= 0.0){
+        // Plain reflection: only the sign differs between components.
+        get_Field_LAP(Ajac, x, y, z) = get_Field_LAP(Ajac, x, 2*LAP-y, z);
+
+        get_Field_LAP(Akx, x, y, z) =  get_Field_LAP(Akx, x, 2*LAP-y, z);
+        get_Field_LAP(Aky, x, y, z) = -get_Field_LAP(Aky, x, 2*LAP-y, z);
+        get_Field_LAP(Akz, x, y, z) =  get_Field_LAP(Akz, x, 2*LAP-y, z);
+
+        get_Field_LAP(Aix, x, y, z) = -get_Field_LAP(Aix, x, 2*LAP-y, z);
+        get_Field_LAP(Aiy, x, y, z) =  get_Field_LAP(Aiy, x, 2*LAP-y, z);
+        get_Field_LAP(Aiz, x, y, z) = -get_Field_LAP(Aiz, x, 2*LAP-y, z);
+
+        get_Field_LAP(Asx, x, y, z) =  get_Field_LAP(Asx, x, 2*LAP-y, z);
+        get_Field_LAP(Asy, x, y, z) = -get_Field_LAP(Asy, x, 2*LAP-y, z);
+        get_Field_LAP(Asz, x, y, z) =  get_Field_LAP(Asz, x, 2*LAP-y, z);
+        return;
+    }
+
+    // Scaled reflection: local grid differences between the two mirrored rows.
+    const REAL dx = get_Field_LAP(xx, x, 2*LAP-y, z) - get_Field_LAP(xx, x, 2*LAP-y-1, z);
+    const REAL dy = get_Field_LAP(yy, x, 2*LAP-y, z) - get_Field_LAP(yy, x, 2*LAP-y-1, z);
+
+    const REAL cos2 = cos(2*seta);
+    const REAL sin2 = sin(2*seta);
+
+    const REAL tmpxx = fabs(-cos2 + sin2*dy/dx);
+    const REAL tmpyy = fabs(cos2 + dx/dy*sin2);
+    const REAL tmpxy = tmpxx*tmpyy;
+
+    get_Field_LAP(Ajac, x, y, z) = tmpxy*get_Field_LAP(Ajac, x, 2*LAP-y, z);
+
+    get_Field_LAP(Akx, x, y, z) =  tmpyy*get_Field_LAP(Akx, x, 2*LAP-y, z);
+    get_Field_LAP(Aky, x, y, z) = -tmpxx*get_Field_LAP(Aky, x, 2*LAP-y, z);
+    get_Field_LAP(Akz, x, y, z) =  tmpxy*get_Field_LAP(Akz, x, 2*LAP-y, z);
+
+    get_Field_LAP(Aix, x, y, z) = -tmpyy*get_Field_LAP(Aix, x, 2*LAP-y, z);
+    get_Field_LAP(Aiy, x, y, z) =  tmpxx*get_Field_LAP(Aiy, x, 2*LAP-y, z);
+    get_Field_LAP(Aiz, x, y, z) = -tmpxy*get_Field_LAP(Aiz, x, 2*LAP-y, z);
+
+    get_Field_LAP(Asx, x, y, z) =  tmpyy*get_Field_LAP(Asx, x, 2*LAP-y, z);
+    get_Field_LAP(Asy, x, y, z) = -tmpxx*get_Field_LAP(Asy, x, 2*LAP-y, z);
+    get_Field_LAP(Asz, x, y, z) =  tmpxy*get_Field_LAP(Asz, x, 2*LAP-y, z);
+}
+
+/*
+ * Host launcher for boundary_Jac3d_kernal_y_ramp_wall_kernel.
+ *
+ * Only the process row with npy == 0 does any work. The job spans
+ * x in [LAP, nx_lap), y in [0, LAP), z in [LAP, nz_lap).
+ *
+ * seta: ramp angle; it is divided by PI before being handed to the kernel.
+ * NOTE(review): a degree-to-radian conversion would be seta*PI/180 - confirm
+ * that seta/PI is the intended scaling for the caller's units.
+ */
+void boundary_Jac3d_kernal_y_ramp_wall(REAL seta){
+    if (npy == 0)
+    {
+        seta = seta/PI;
+        dim3 griddim , blockdim;
+        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
+        // Size the launch to the job extent (nx x LAP x nz). A default-constructed
+        // dim3 is (1,1,1), so without this call only a single point is processed.
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_ramp_wall_kernel<<<griddim , blockdim>>>(*pAxx_d,*pAyy_d,*pAkx_d,
+            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,seta,job) ));
+    }
+}
+
+/*
+ * Ghost-point fill for the cone-wall boundary on the low-z side.
+ *
+ * For every point (x, y, z) inside `job`, the Jacobian (Ajac) and the nine
+ * metric fields are overwritten from their own values mirrored in z about
+ * index LAP (source index 2*LAP - z), with a fixed sign pattern and scale
+ * factors built from the wall angle and the local differences dx, dz of the
+ * xx / zz fields between mirrored planes 2*LAP-z and 2*LAP-z-1.
+ *
+ * The wall angle is seta1 where xx(x, y, LAP) <= 0 and seta1 + seta2
+ * elsewhere; everything else is identical for the two regions.
+ *
+ * NOTE(review): dx == 0 or dz == 0 leads to a division by zero - confirm the
+ * grid rules this out.
+ */
+__global__ void boundary_Jac3d_kernal_z_cone_wall_kernel(
+    cudaField xx, 
+    cudaField zz, 
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    REAL seta1,
+    REAL seta2,
+    cudaJobPackage job){
+    const unsigned int x = job.start.x + blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = job.start.y + blockIdx.y*blockDim.y + threadIdx.y;
+    const unsigned int z = job.start.z + blockIdx.z*blockDim.z + threadIdx.z;
+
+    // Threads that fall outside the job extent have nothing to write.
+    if( x >= job.end.x || y >= job.end.y || z >= job.end.z ) return;
+
+    // Doubled wall angle for the region this column belongs to.
+    REAL angle2;
+    if( get_Field_LAP(xx, x, y, LAP) <= 0.0){
+        angle2 = 2*seta1;
+    }else{
+        angle2 = 2*(seta1+seta2);
+    }
+
+    // Local grid differences between the two mirrored planes.
+    const REAL dx = get_Field_LAP(xx, x, y, 2*LAP-z) - get_Field_LAP(xx, x, y, 2*LAP-z-1);
+    const REAL dz = get_Field_LAP(zz, x, y, 2*LAP-z) - get_Field_LAP(zz, x, y, 2*LAP-z-1);
+
+    const REAL tmpxx = fabs(-cos(angle2) + sin(angle2)*dz/dx);
+    const REAL tmpzz = fabs(cos(angle2) + dx/dz*sin(angle2));
+    const REAL tmpxz = tmpxx*tmpzz;
+
+    get_Field_LAP(Ajac, x, y, z) = tmpxz*get_Field_LAP(Ajac, x, y, 2*LAP-z);
+
+    get_Field_LAP(Akx, x, y, z) =  tmpzz*get_Field_LAP(Akx, x, y, 2*LAP-z);
+    get_Field_LAP(Aky, x, y, z) = -tmpxz*get_Field_LAP(Aky, x, y, 2*LAP-z);
+    get_Field_LAP(Akz, x, y, z) = -tmpxx*get_Field_LAP(Akz, x, y, 2*LAP-z);
+
+    get_Field_LAP(Aix, x, y, z) = -tmpzz*get_Field_LAP(Aix, x, y, 2*LAP-z);
+    get_Field_LAP(Aiy, x, y, z) =  tmpxz*get_Field_LAP(Aiy, x, y, 2*LAP-z);
+    get_Field_LAP(Aiz, x, y, z) =  tmpxx*get_Field_LAP(Aiz, x, y, 2*LAP-z);
+
+    get_Field_LAP(Asx, x, y, z) = -tmpzz*get_Field_LAP(Asx, x, y, 2*LAP-z);
+    get_Field_LAP(Asy, x, y, z) =  tmpxz*get_Field_LAP(Asy, x, y, 2*LAP-z);
+    get_Field_LAP(Asz, x, y, z) =  tmpxx*get_Field_LAP(Asz, x, y, 2*LAP-z);
+}
+
+/*
+ * Host launcher for boundary_Jac3d_kernal_z_cone_wall_kernel.
+ *
+ * Only the process layer with npz == 0 does any work. The job spans
+ * x in [LAP, nx_lap), y in [LAP, ny_lap), z in [0, LAP).
+ *
+ * seta1, seta2: wall angles; both are divided by PI before being handed to
+ * the kernel.
+ * NOTE(review): a degree-to-radian conversion would be angle*PI/180 - confirm
+ * that angle/PI is the intended scaling for the caller's units.
+ */
+void boundary_Jac3d_kernal_z_cone_wall(REAL seta1, REAL seta2){
+    if (npz == 0)
+    {
+        seta1 = seta1/PI;
+        seta2 = seta2/PI;
+        dim3 griddim , blockdim;
+        cudaJobPackage job( dim3(LAP, LAP, 0) , dim3(nx_lap, ny_lap, LAP) );
+        // Size the launch to the job extent (nx x ny x LAP). A default-constructed
+        // dim3 is (1,1,1), so without this call only a single point is processed.
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , LAP );
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_z_cone_wall_kernel<<<griddim , blockdim>>>(*pAxx_d,*pAzz_d,*pAkx_d,
+            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,seta1,seta2,job) ));
+    }
+}
+
+/*
+ * Apply the y-symmetry reflection to the three fields pAxx_d, pAyy_d, pAzz_d.
+ *
+ * Runs only when IF_SYMMETRY == 1. The process row npy == 0 fills the low-y
+ * layer y in [0, LAP); the row npy == NPY0-1 fills the high-y layer
+ * y in [ny_lap, ny_2lap). pAyy_d is reflected with factor -1, the other two
+ * with factor +1.
+ */
+void boundary_Jac3d_Axx()
+{
+    // Nothing to do unless the symmetry boundary is switched on.
+    if(IF_SYMMETRY != 1) return;
+
+    // Low-y side.
+    if (npy == 0)
+    {
+        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
+        dim3 griddim , blockdim;
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAxx_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAyy_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAzz_d , 1.0 , job) ));
+    }
+
+    // High-y side.
+    if (npy == NPY0 - 1)
+    {
+        cudaJobPackage job( dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap) );
+        dim3 griddim , blockdim;
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAxx_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAyy_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAzz_d , 1.0 , job) ));
+    }
+}
+
+
+
+/*
+ * Apply the y-symmetry reflection to the nine metric fields and the Jacobian.
+ *
+ * Runs only when IF_SYMMETRY == 1. The process row npy == 0 fills the low-y
+ * layer y in [0, LAP); the row npy == NPY0-1 fills the high-y layer
+ * y in [ny_lap, ny_2lap). Reflection factors:
+ *   Akx +1, Aky -1, Akz +1
+ *   Aix -1, Aiy +1, Aiz -1
+ *   Asx +1, Asy -1, Asz +1
+ *   Ajac +1
+ */
+void boundary_Jac3d_Liftbody_Ajac()
+{
+    // Nothing to do unless the symmetry boundary is switched on.
+    if(IF_SYMMETRY != 1) return;
+
+    // Low-y side.
+    if (npy == 0)
+    {
+        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
+        dim3 griddim , blockdim;
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAkx_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAky_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAkz_d , 1.0 , job) ));
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAix_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAiy_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAiz_d ,-1.0 , job) ));
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAsx_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAsy_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAsz_d , 1.0 , job) ));
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAjac_d , 1.0 , job) ));
+    }
+
+    // High-y side.
+    if (npy == NPY0 - 1)
+    {
+        cudaJobPackage job( dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap) );
+        dim3 griddim , blockdim;
+        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAkx_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAky_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAkz_d , 1.0 , job) ));
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAix_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAiy_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAiz_d ,-1.0 , job) ));
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAsx_d , 1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAsy_d ,-1.0 , job) ));
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAsz_d , 1.0 , job) ));
+
+        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAjac_d , 1.0 , job) ));
+    }
+}
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_IO.c
+++ b/src/OCFD_IO.c
+//Read & save file
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include "mpi.h"
+
+#include "OCFD_ana.h"
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_IO.h"
+#include "OCFD_IO_mpi.h"
+#include "io_warp.h"
+
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "commen_kernel.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+/*
+ * Read a flow-field file into host arrays (collective over MPI_COMM_WORLD).
+ *
+ * Iflag_av == 0: read the instantaneous data file. Rank 0 tries to read a
+ *   step number from "Opencfd.msg"; when one is obtained "OCFD<step>.dat" is
+ *   read, otherwise "opencfd.dat". The header supplies Istep and tt, then
+ *   five variables are read into pd, pu, pv, pw, pT.
+ * Iflag_av == 1: read "opencfd.average" into the global averaged fields
+ *   pdm, pum, pvm, pwm, pTm and copy them to the device; when the file does
+ *   not exist the averaging counters are reset instead. pd..pT are unused.
+ * Any other value: nothing is done.
+ *
+ * Layout implied by the offsets used below: the header is
+ * [int][int step][REAL time][int], and each variable occupies
+ * NZ_GLOBAL * (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) bytes.
+ *
+ * The run is aborted with MPI_Abort when a data file cannot be opened.
+ */
+void read_file(
+	int Iflag_av,
+	REAL * pd,
+	REAL * pu,
+	REAL * pv,
+	REAL * pw,
+	REAL * pT)
+{
+	char filename1[100];
+	/* Bytes taken by one 3-D variable; sizeof first keeps the product in size_t. */
+	const MPI_Offset var_bytes = (2*sizeof(int) + sizeof(REAL) * NX_GLOBAL * NY_GLOBAL) * NZ_GLOBAL;
+	/* First variable starts right after the header record. */
+	const MPI_Offset header_bytes = 3*sizeof(int)+sizeof(REAL);
+
+	//-----------------------------------------------------------
+	if (Iflag_av == 0)
+	{
+		int Irestart_step = -1;
+		if (my_id == 0)
+		{
+			FILE *msg_file = fopen("Opencfd.msg", "r");
+			if (msg_file != NULL)
+			{
+				/* A short or failed read must not leave a garbage step number. */
+				if (fread(&Irestart_step, sizeof(int), 1, msg_file) != 1)
+					Irestart_step = -1;
+				fclose(msg_file);
+			}
+			else
+			{
+				printf("Opencfd.msg is not exist, read initial file : opencfd.dat ......\n");
+			}
+		}
+
+		MPI_Bcast(&Irestart_step, 1, MPI_INT, 0, MPI_COMM_WORLD);
+		if (Irestart_step < 0)
+		{
+			snprintf(filename1, sizeof(filename1), "opencfd.dat");
+		}
+		else
+		{
+			snprintf(filename1, sizeof(filename1), "OCFD%08d.dat", Irestart_step);
+		}
+
+		if (my_id == 0) printf("read initial data file: %s \n\n", filename1);
+
+		MPI_File data_file;
+		if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_RDONLY, MPI_INFO_NULL, &data_file) != MPI_SUCCESS)
+		{
+			if (my_id == 0) fprintf(stderr, "read_file: cannot open data file %s\n", filename1);
+			MPI_Abort(MPI_COMM_WORLD, 1);
+		}
+
+		/* Read the header straight into correctly typed objects instead of
+		 * type-punning a REAL through an int array. */
+		int step_in_file = 0;
+		REAL time_in_file = 0;
+		MPI_File_read_at_all(data_file, sizeof(int), &step_in_file, 1, MPI_INT, &status);
+		MPI_File_read_at_all(data_file, 2*sizeof(int), &time_in_file, 1, OCFD_DATA_TYPE, &status);
+
+		Istep = step_in_file;
+		tt = time_in_file;
+		if (my_id == 0) printf("Istep=%d , tt=%lf\n", Istep, tt);
+
+		MPI_Offset offset = header_bytes;
+		read_3d1(data_file, offset, pd);
+		offset += var_bytes;
+		read_3d1(data_file, offset, pu);
+		offset += var_bytes;
+		read_3d1(data_file, offset, pv);
+		offset += var_bytes;
+		read_3d1(data_file, offset, pw);
+		offset += var_bytes;
+		read_3d1(data_file, offset, pT);
+
+		MPI_File_close(&data_file);
+		//------------------------
+		if (my_id == 0)
+			printf("read data ok\n");
+	}
+	//--------------------
+	else if (Iflag_av == 1)
+	{
+		// averaged file
+		snprintf(filename1, sizeof(filename1), "opencfd.average");
+
+		if (access(filename1, F_OK) == -1)
+		{
+			// The file does not exist: start averaging from scratch.
+			if (my_id == 0) printf("Average file: %s is not exit\n\n", filename1);
+			Istep_average = 0;
+			tt_average = 0.0;
+
+			init_time_average();
+		}
+		else
+		{
+			if (my_id == 0)
+				printf("read average_data begin\n");
+
+			MPI_File avg_file;
+			if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_RDONLY, MPI_INFO_NULL, &avg_file) != MPI_SUCCESS)
+			{
+				if (my_id == 0) fprintf(stderr, "read_file: cannot open average file %s\n", filename1);
+				MPI_Abort(MPI_COMM_WORLD, 1);
+			}
+
+			int step_in_file = 0;
+			REAL time_in_file = 0;
+			MPI_File_read_at_all(avg_file, sizeof(int), &step_in_file, 1, MPI_INT, &status);
+			MPI_File_read_at_all(avg_file, 2*sizeof(int), &time_in_file, 1, OCFD_DATA_TYPE, &status);
+
+			Istep_average = step_in_file;
+			tt_average = time_in_file;
+			if (my_id == 0) printf("Istep_average=%d , tt_average=%lf\n", Istep_average, tt_average);
+
+			init_time_average();
+
+			MPI_Offset offset = header_bytes;
+			read_3d1(avg_file, offset, pdm);
+			offset += var_bytes;
+			read_3d1(avg_file, offset, pum);
+			offset += var_bytes;
+			read_3d1(avg_file, offset, pvm);
+			offset += var_bytes;
+			read_3d1(avg_file, offset, pwm);
+			offset += var_bytes;
+			read_3d1(avg_file, offset, pTm);
+
+			MPI_File_close(&avg_file);
+
+			//------------------------
+			if (my_id == 0)
+				printf("read average_data ok\n");
+
+			// Push the freshly read averaged fields to the device.
+			memcpy_inner(pdm , pdm_d->ptr , pdm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pum , pum_d->ptr , pum_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pvm , pvm_d->ptr , pvm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pwm , pwm_d->ptr , pwm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+			memcpy_inner(pTm , pTm_d->ptr , pTm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
+		}
+		average_IO = 0;
+	}
+	//---------------------
+}
+//----------------------------------------------------------------------------------
+
+//================================================================================
+/*
+ * Write five 3-D fields to "OCFD<Istep_name>.dat" (Iflag_av == 0) or
+ * "OCFD<Istep_name>.average" (any other Iflag_av), collectively over
+ * MPI_COMM_WORLD.
+ *
+ * The header is [int size][int step][REAL time][int size], where step/time
+ * are Istep/tt for a data file and Istep_average/tt_average for an averaged
+ * file. The fields pd, pu, pv, pw, pT follow, each occupying
+ * NZ_GLOBAL * (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) bytes.
+ *
+ * The run is aborted with MPI_Abort when the output file cannot be opened.
+ */
+void OCFD_save(
+	int Iflag_av,
+	int Istep_name,
+	REAL * pd,
+	REAL * pu,
+	REAL * pv,
+	REAL * pw,
+	REAL * pT)
+{
+	char filename1[120];
+	//-------------------------------------------
+	MPI_File out_file;
+
+	/* Value stored in the two ints that bracket the header record.
+	 * NOTE(review): this equals the payload size sizeof(int)+sizeof(REAL)
+	 * only when REAL is 8 bytes; kept at 3*sizeof(int) to leave the file
+	 * contents unchanged - confirm for single-precision builds. */
+	int size_tmp = 3 * sizeof(int);
+
+	if (Iflag_av == 0)
+	{
+		snprintf(filename1, sizeof(filename1), "OCFD%08d.dat", Istep_name);
+	}
+	else
+	{
+		snprintf(filename1, sizeof(filename1), "OCFD%08d.average", Istep_name);
+	}
+	if (my_id == 0) printf("write data file: %s\n", filename1);
+
+	if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &out_file) != MPI_SUCCESS)
+	{
+		if (my_id == 0) fprintf(stderr, "OCFD_save: cannot open %s for writing\n", filename1);
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	/* Header payload held in correctly typed objects, so no REAL is stored
+	 * through an int pointer. */
+	int step_out;
+	REAL time_out;
+	if (Iflag_av == 0)
+	{
+		step_out = Istep;
+		time_out = tt;
+	}
+	else
+	{
+		step_out = Istep_average;
+		time_out = tt_average;
+	}
+
+	MPI_File_write_at_all(out_file, 0, &size_tmp, 1, MPI_INT, &status);
+	MPI_File_write_at_all(out_file, sizeof(int), &step_out, 1, MPI_INT, &status);
+	MPI_File_write_at_all(out_file, 2*sizeof(int), &time_out, 1, OCFD_DATA_TYPE, &status);
+	MPI_File_write_at_all(out_file, 2*sizeof(int)+sizeof(REAL), &size_tmp, 1, MPI_INT, &status);
+
+	/* Bytes taken by one 3-D variable; sizeof first keeps the product in size_t. */
+	const MPI_Offset var_bytes = (2*sizeof(int) + sizeof(REAL) * NX_GLOBAL * NY_GLOBAL) * NZ_GLOBAL;
+	MPI_Offset offset = 3*sizeof(int)+sizeof(REAL);
+
+	write_3d1(out_file, offset, pd);
+	offset += var_bytes;
+	write_3d1(out_file, offset, pu);
+	offset += var_bytes;
+	write_3d1(out_file, offset, pv);
+	offset += var_bytes;
+	write_3d1(out_file, offset, pw);
+	offset += var_bytes;
+	write_3d1(out_file, offset, pT);
+
+	MPI_File_close(&out_file);
+
+	// Disabled: recording the saved step number in "Opencfd.msg".
+	//if (my_id == 0)
+	//{
+	//	if (Iflag_av == 0)
+	//	{
+	//		printf("write data OK\n");
+	//		tmp_file = fopen("Opencfd.msg", "a");
+	//		fprintf(tmp_file, "%d", Istep_name);
+	//		fclose(tmp_file);
+	//	}
+	//}
+}
+//-------------------------------------------------------------------------------------------
+
+//---------------------------------------------------------------------------------------------
+/*
+ * Write the interior of a halo-padded local field.
+ *
+ * pU is viewed as a [..][ny + 2*LAP][nx + 2*LAP] array. The nx*ny*nz interior
+ * points (indices LAP .. n+LAP-1 in each direction) are packed contiguously
+ * into a temporary buffer, which is handed to write_3d(file, offset, ...).
+ *
+ * The run is aborted with MPI_Abort when the temporary buffer cannot be
+ * allocated.
+ */
+void write_3d1(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+
+	/* sizeof first: the element count is then multiplied in size_t, not int. */
+	const size_t packed_bytes = sizeof(REAL) * nx * ny * nz;
+	REAL *packed = (REAL *)malloc(packed_bytes);
+	if (packed == NULL && packed_bytes != 0)
+	{
+		fprintf(stderr, "write_3d1: cannot allocate %lu bytes\n", (unsigned long)packed_bytes);
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	/* Strip the halo: copy the interior in k, j, i order. */
+	REAL *dst = packed;
+	for (int k = LAP; k < nz + LAP; k++)
+	{
+		for (int j = LAP; j < ny + LAP; j++)
+		{
+			for (int i = LAP; i < nx + LAP; i++)
+			{
+				*dst++ = U[k][j][i];
+			}
+		}
+	}
+
+	write_3d(file, offset, packed);
+	free(packed);
+}
+
+/*
+ * Read the interior of a halo-padded local field.
+ *
+ * read_3d(file, offset, ...) fills a temporary contiguous buffer of
+ * nx*ny*nz values, which are then scattered into the interior points
+ * (indices LAP .. n+LAP-1 in each direction) of pU, viewed as a
+ * [..][ny + 2*LAP][nx + 2*LAP] array. Halo points of pU are left untouched.
+ *
+ * The run is aborted with MPI_Abort when the temporary buffer cannot be
+ * allocated.
+ */
+void read_3d1(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+
+	/* sizeof first: the element count is then multiplied in size_t, not int. */
+	const size_t packed_bytes = sizeof(REAL) * nx * ny * nz;
+	REAL *packed = (REAL *)malloc(packed_bytes);
+	if (packed == NULL && packed_bytes != 0)
+	{
+		fprintf(stderr, "read_3d1: cannot allocate %lu bytes\n", (unsigned long)packed_bytes);
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	read_3d(file, offset, packed);
+
+	/* Re-insert the halo offset: copy into the interior in k, j, i order. */
+	const REAL *src = packed;
+	for (int k = LAP; k < nz + LAP; k++)
+	{
+		for (int j = LAP; j < ny + LAP; j++)
+		{
+			for (int i = LAP; i < nx + LAP; i++)
+			{
+				U[k][j][i] = *src++;
+			}
+		}
+	}
+
+	free(packed);
+}
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_IO_mpi.c
+++ b/src/OCFD_IO_mpi.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpi.h"
+
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_mpi.h"
+#include "io_warp.h"
+
+//---------------------------------------------------
+#ifdef __cplusplus
+extern "C"{
+#endif
+//void write_2d_XYa(
+//	FILE *file,
+//	int ka,
+//	int size_x,
+//	int size_y,
+//	int lap,
+//	int *pU)
+//{
+//
+//	int(*U)
+//	[size_y + 2*lap][size_x + 2*lap] = (int(*)[size_y + 2*lap][size_x + 2*lap])(pU);
+//	int(*U2d)
+//	[NX_GLOBAL], (*U0)[NX_GLOBAL];
+//	int node_k, k_local;
+//
+//	U2d = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
+//	memset((void*)U2d, 0, NX_GLOBAL * NY_GLOBAL * sizeof(int));
+//	if (my_id == 0){
+//		U0 = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
+//	}
+//	//--------------------------------
+//	get_k_node(ka, &node_k, &k_local);
+//	k_local += lap;
+//	int i, j;
+//	if(npz == node_k){
+//		for (j = lap; j < ny + lap; j++)
+//		{
+//			for (i = lap; i < nx + lap; i++)
+//			{
+//				U2d[j - lap + j_offset[npy]][i - lap + i_offset[npx]] = U[k_local][j][i];
+//			}
+//		}
+//	}
+//	MPI_Reduce(&U2d[0][0], &U0[0][0], NX_GLOBAL * NY_GLOBAL, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+////	if (my_id == 0)
+////		FWRITE(U0, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
+//
+//	if(my_id == 0){
+//		for(j = 0; j < NY_GLOBAL; j++){
+//			for(i = 0; i < NX_GLOBAL; i++){
+//				fprintf(file, "%08d\n", U0[j][i]);
+//			}
+//		}
+//	}
+//
+//	free(U2d);
+//	if (my_id == 0)
+//		free(U0);
+//}
+/*
+ * Gather one global x-y plane (global k index ka) of an int field and of a
+ * REAL field onto rank 0 and write it to `file` as text, one
+ * "%08d%15.6lf" line per point, j-major.
+ *
+ * pU  : int field with a halo of `lap` points, local extent size_x x size_y.
+ * pU1 : REAL field with a halo of LAP points, local extent nx x ny.
+ *
+ * Each rank places its part of the plane into a zero-filled global-size
+ * buffer; MPI_Reduce with MPI_SUM then assembles the plane on rank 0.
+ * Only rank 0 writes to `file`.
+ *
+ * NOTE(review): both fields are sampled at column i+1 rather than i, and
+ * the REAL field is indexed at plane k_local + lap + LAP (k_local has
+ * already been shifted by `lap`). Both look like possible off-by-one /
+ * double-offset errors; they are left unchanged - confirm against callers.
+ */
+void write_2d_XY(
+	FILE *file,
+	int ka,
+	int size_x,
+	int size_y,
+	int lap,
+	int *pU,
+	REAL *pU1)
+{
+
+	int(*U)[size_y + 2*lap][size_x + 2*lap] = (int(*)[size_y + 2*lap][size_x + 2*lap])(pU);
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	int(*U2d)[NX_GLOBAL];
+	REAL(*U2d1)[NX_GLOBAL];
+	/* Receive buffers exist on rank 0 only; NULL elsewhere so that no
+	 * indeterminate pointer is ever evaluated or freed. */
+	int(*U0)[NX_GLOBAL] = NULL;
+	REAL(*U01)[NX_GLOBAL] = NULL;
+	int node_k, k_local;
+
+	U2d = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
+	U2d1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+	memset((void*)U2d, 0, NX_GLOBAL * NY_GLOBAL * sizeof(int));
+	memset((void*)U2d1, 0, NX_GLOBAL * NY_GLOBAL * sizeof(REAL));
+	if (my_id == 0){
+		U0 = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
+		U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+	}
+	//--------------------------------
+	get_k_node(ka, &node_k, &k_local);
+	k_local += lap;
+	int i, j;
+	if(npz == node_k){
+		for (j = lap; j < ny + lap; j++)
+		{
+			for (i = lap; i < nx + lap; i++)
+			{
+				U2d[j - lap + j_offset[npy]][i - lap + i_offset[npx]] = U[k_local][j][i+1];
+			}
+		}
+
+		for (j = LAP; j < ny + LAP; j++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U2d1[j - LAP + j_offset[npy]][i - LAP + i_offset[npx]] = U1[k_local + LAP][j][i+1];
+			}
+		}
+	}
+	/* The receive buffer is only significant at the root. */
+	MPI_Reduce(U2d, U0, NX_GLOBAL * NY_GLOBAL, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	MPI_Reduce(U2d1, U01, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0){
+		for(j = 0; j < NY_GLOBAL; j++){
+			for(i = 0; i < NX_GLOBAL; i++){
+				fprintf(file, "%08d%15.6lf\n", U0[j][i], U01[j][i]);
+			}
+		}
+	}
+
+	free(U2d);
+	free(U2d1);
+	/* free(NULL) is a no-op, so this is safe on non-root ranks. */
+	free(U0);
+	free(U01);
+}
+
+
+/*
+ * Gather one global x-y plane (global k index ka) of a REAL field onto
+ * rank 0 and write it to `file` with FWRITE as NX_GLOBAL*NY_GLOBAL values.
+ *
+ * pU1 is the local field with a halo of LAP points (local extent nx x ny).
+ * Ranks owning the plane place their part into a zero-filled global-size
+ * buffer; MPI_Reduce with MPI_SUM then assembles the plane on rank 0.
+ * Only rank 0 writes to `file`.
+ */
+void write_2d_XYa(
+	FILE *file,
+	int ka,
+	REAL *pU1)
+{
+
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	REAL(*U2d1)[NX_GLOBAL];
+	/* Receive buffer exists on rank 0 only; NULL elsewhere so that no
+	 * indeterminate pointer is ever evaluated or freed. */
+	REAL(*U01)[NX_GLOBAL] = NULL;
+	int node_k, k_local;
+
+	U2d1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+
+	memset((void*)U2d1, 0, NX_GLOBAL * NY_GLOBAL * sizeof(REAL));
+
+	if (my_id == 0){
+		U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+	}
+	//--------------------------------
+	get_k_node(ka, &node_k, &k_local);
+
+	int i, j;
+	if(npz == node_k){
+		for (j = LAP; j < ny + LAP; j++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U2d1[j - LAP + j_offset[npy]][i - LAP + i_offset[npx]] = U1[k_local + LAP][j][i];
+			}
+		}
+	}
+
+	/* The receive buffer is only significant at the root. */
+	MPI_Reduce(U2d1, U01, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0) FWRITE(U01, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+
+	free(U2d1);
+	/* free(NULL) is a no-op, so this is safe on non-root ranks. */
+	free(U01);
+}
+
+/*
+ * Gather one global y-z plane (global i index ia) of a REAL field onto
+ * rank 0 and write it to `file` with FWRITE as NY_GLOBAL*NZ_GLOBAL values.
+ *
+ * pU1 is the local field with a halo of LAP points (local extent
+ * nx x ny x nz). Ranks owning the plane place their part into a zero-filled
+ * global-size buffer; MPI_Reduce with MPI_SUM then assembles the plane on
+ * rank 0. Only rank 0 writes to `file`.
+ */
+void write_2d_YZa(
+	FILE *file,
+	int ia,
+	REAL *pU1)
+{
+
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	REAL(*U2d1)[NY_GLOBAL];
+	/* Receive buffer exists on rank 0 only; NULL elsewhere so that no
+	 * indeterminate pointer is ever evaluated or freed. */
+	REAL(*U01)[NY_GLOBAL] = NULL;
+	int node_i, i_local;
+
+	U2d1 = (REAL(*)[NY_GLOBAL])malloc(sizeof(REAL) * NY_GLOBAL * NZ_GLOBAL);
+
+	memset((void*)U2d1, 0, NY_GLOBAL * NZ_GLOBAL * sizeof(REAL));
+
+	if (my_id == 0){
+		U01 = (REAL(*)[NY_GLOBAL])malloc(sizeof(REAL) * NY_GLOBAL * NZ_GLOBAL);
+	}
+	//--------------------------------
+	get_i_node(ia, &node_i, &i_local);
+
+	int j, k;
+	if(npx == node_i){
+		for (k = LAP; k < nz + LAP; k++)
+		{
+			for (j = LAP; j < ny + LAP; j++)
+			{
+				U2d1[k - LAP + k_offset[npz]][j - LAP + j_offset[npy]] = U1[k][j][i_local + LAP];
+			}
+		}
+	}
+
+	/* The receive buffer is only significant at the root. */
+	MPI_Reduce(U2d1, U01, NY_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0) FWRITE(U01, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
+
+	free(U2d1);
+	/* free(NULL) is a no-op, so this is safe on non-root ranks. */
+	free(U01);
+}
+
+
+/*
+ * Gather one global x-z plane (global j index ja) of a REAL field onto
+ * rank 0 and write it to `file` with FWRITE as NX_GLOBAL*NZ_GLOBAL values.
+ *
+ * pU1 is the local field with a halo of LAP points (local extent
+ * nx x ny x nz). Ranks owning the plane place their part into a zero-filled
+ * global-size buffer; MPI_Reduce with MPI_SUM then assembles the plane on
+ * rank 0. Only rank 0 writes to `file`.
+ */
+void write_2d_XZa(
+	FILE *file,
+	int ja,
+	REAL *pU1)
+{
+
+	REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
+	REAL(*U2d1)[NX_GLOBAL];
+	/* Receive buffer exists on rank 0 only; NULL elsewhere so that no
+	 * indeterminate pointer is ever evaluated or freed. */
+	REAL(*U01)[NX_GLOBAL] = NULL;
+	int node_j, j_local;
+
+	U2d1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NZ_GLOBAL);
+
+	memset((void*)U2d1, 0, NX_GLOBAL * NZ_GLOBAL * sizeof(REAL));
+
+	if (my_id == 0){
+		U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NZ_GLOBAL);
+	}
+	//--------------------------------
+	get_j_node(ja, &node_j, &j_local);
+
+	int i, k;
+	if(npy == node_j){
+		for (k = LAP; k < nz + LAP; k++)
+		{
+			for (i = LAP; i < nx + LAP; i++)
+			{
+				U2d1[k - LAP + k_offset[npz]][i - LAP + i_offset[npx]] = U1[k][j_local + LAP][i];
+			}
+		}
+	}
+
+	/* The receive buffer is only significant at the root. */
+	MPI_Reduce(U2d1, U01, NX_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	if(my_id == 0) FWRITE(U01, sizeof(REAL), NX_GLOBAL * NZ_GLOBAL, file)
+
+	free(U2d1);
+	/* free(NULL) is a no-op, so this is safe on non-root ranks. */
+	free(U01);
+}
+
+
+//--------------------------------------------------------------
+//-----Write a 2D Y-Z (j-k) plane from 3-D array
+//void write_2d_YZa(
+//	FILE *file,
+//	int ia,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+//	REAL(*U2d), (*U0);
+//	int node_i, i_local;
+//
+//	U2d = (REAL *)malloc(sizeof(REAL) * ny * nz);
+//	if (my_id == 0)
+//		U0 = (REAL *)malloc(sizeof(REAL) * NY_GLOBAL * NZ_GLOBAL);
+//	//--------------------------------
+//	get_i_node(ia, &node_i, &i_local);
+//	i_local += LAP;
+//	int k, j;
+//	REAL *tmp = U2d;
+//	for (k = LAP; k < nz + LAP; k++)
+//	{
+//		for (j = LAP; j < ny + LAP; j++)
+//		{
+//			(*tmp++) = U[k][j][i_local];
+//		}
+//	}
+//
+//	for (int proc_k = 0; proc_k < NPZ0; k++)
+//	{
+//		for (int kk = k_offset[proc_k]; kk < k_offset[proc_k] + k_nn[proc_k]; kk++)
+//		{
+//			for (int proc_j = 0; proc_j < NPY0; proc_j++)
+//			{
+//				if (npx == node_i && npy == proc_j && npz == proc_k)
+//				{
+//					k = kk - k_offset[proc_k];
+//					MPI_Bsend(U2d + k * ny, ny, OCFD_DATA_TYPE, 0, kk, MPI_COMM_WORLD);
+//				}
+//				if (my_id == 0)
+//				{
+//					int recv_offset = j_offset[proc_j] + NY_GLOBAL * kk;
+//					MPI_Status status;
+//					MPI_Recv(U0 + recv_offset, j_nn[proc_j], OCFD_DATA_TYPE, PROCIdx2Num(node_i, proc_j, proc_k), kk, MPI_COMM_WORLD, &status);
+//				}
+//			}
+//			MPI_Barrier(MPI_COMM_WORLD);
+//		}
+//	}
+//	if (my_id == 0)
+//		FWRITE(U0, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
+//
+//	free(U2d);
+//	if (my_id == 0)
+//		free(U0);
+//}
+
+//-------------------------------------------------
+//----Write a 2d xz-plane from 3d array------------------------
+
+//void write_2d_XZa(
+//	FILE *file,
+//	int ja,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+//	REAL(*U2d), (*U0);
+//	int node_j, j_local;
+//
+//	U2d = (REAL *)malloc(sizeof(REAL) * nx * nz);
+//	if (my_id == 0)
+//		U0 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+//	//--------------------------------
+//	get_j_node(ja, &node_j, &j_local);
+//	j_local += LAP;
+//	int k, i;
+//	REAL *tmp = U2d;
+//	for (k = LAP; k < nz + LAP; k++)
+//	{
+//		for (i = LAP; i < nx + LAP; i++)
+//		{
+//			(*tmp++) = U[k][j_local][i];
+//		}
+//	}
+//	for (int proc_k = 0; proc_k < NPZ0; k++)
+//	{
+//		for (int kk = k_offset[proc_k]; kk < k_offset[proc_k] + k_nn[proc_k]; kk++)
+//		{
+//			for (int proc_i = 0; proc_i < NPX0; proc_i++)
+//			{
+//				if (npy == node_j && npx == proc_i && npz == proc_k)
+//				{
+//					k = kk - k_offset[proc_k];
+//					MPI_Bsend(U2d + k * nx, nx, OCFD_DATA_TYPE, 0, kk, MPI_COMM_WORLD);
+//				}
+//				if (my_id == 0)
+//				{
+//					int recv_offset = i_offset[proc_i] + NX_GLOBAL * kk;
+//					MPI_Status status;
+//					MPI_Recv(U0 + recv_offset, i_nn[proc_i], OCFD_DATA_TYPE, PROCIdx2Num(proc_i, node_j, proc_k), kk, MPI_COMM_WORLD, &status);
+//				}
+//			}
+//			MPI_Barrier(MPI_COMM_WORLD);
+//		}
+//	}
+//	if (my_id == 0)
+//		FWRITE(U0, sizeof(REAL), NX_GLOBAL * NZ_GLOBAL, file)
+//
+//	free(U2d);
+//	if (my_id == 0)
+//		free(U0);
+//}
+//--------------------------------------------------
+
+//----Write points from 3d array------------------------
+// TODO: the index convention used for ia, ja, ka in the external input file still needs to be pinned down
+/*
+ * Collect `mpoints` probe values of a REAL field on rank 0 and write them
+ * to `file` with FWRITE.
+ *
+ * pU is the local field with a halo of LAP points. For each probe m the
+ * global indices (ia[m], ja[m], ka[m]) are mapped to an owning process and
+ * local indices via get_i_node / get_j_node / get_k_node. The owner sends
+ * the value to rank 0 with MPI_Bsend (tag 0) and rank 0 receives it into
+ * slot m. Only rank 0 writes to `file`.
+ *
+ * NOTE(review): MPI_Bsend needs a buffer attached with MPI_Buffer_attach;
+ * that is not visible here - presumably done at start-up.
+ */
+void write_points(
+	FILE *file,
+	REAL *pU,
+	int mpoints,
+	int *ia,
+	int *ja,
+	int *ka)
+{
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+	REAL *U1 = (REAL *)malloc(sizeof(REAL) * mpoints);
+	//--------------------------------
+	for (int m = 0; m < mpoints; m++)
+	{
+		int owner_i, owner_j, owner_k;
+		int li, lj, lk;
+		get_i_node(ia[m], &owner_i, &li);
+		get_j_node(ja[m], &owner_j, &lj);
+		get_k_node(ka[m], &owner_k, &lk);
+
+		/* Owner side: ship the single value to rank 0. */
+		if (npx == owner_i && npy == owner_j && npz == owner_k)
+		{
+			REAL *probe = &U[lk + LAP][lj + LAP][li + LAP];
+			MPI_Bsend(probe, 1, OCFD_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
+		}
+
+		/* Root side: store it in the slot of this probe. */
+		if (my_id == 0)
+		{
+			MPI_Status recv_status;
+			MPI_Recv(U1 + m, 1, OCFD_DATA_TYPE, PROCIdx2Num(owner_i, owner_j, owner_k), 0, MPI_COMM_WORLD, &recv_status);
+		}
+	}
+
+	if (my_id == 0)
+	{
+		FWRITE(U1, sizeof(REAL), mpoints, file)
+	}
+	free(U1);
+}
+
+//--------------------------------------------------
+
+//--------------------------------------------------
+//void read_3d(
+//	FILE *file,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny][nx] = PTR2ARRAY2(pU, nx, ny);
+//
+//	REAL(*buff2d)
+//	[NX_GLOBAL], (*buff1)[NX_GLOBAL], *buff2, *buff_recv;
+//	int sendcounts1[NPY0], displs1[NPY0], sendcounts2[NPX0], displs2[NPX0];
+//	//---------------------------------------------------------------
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			buff2d = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+//		}
+//		buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//		buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);//NY_GLOBAL > ny
+//	}
+//	buff_recv = (REAL *)malloc(sizeof(REAL) * nx * ny);
+//
+//	if (my_id == 0)
+//		printf("read 3d data ...\n");
+//	// sendcounts1 displs1用于j方向分布
+//	for (int j = 0; j < NPY0; j++)
+//	{
+//		sendcounts1[j] = NX_GLOBAL * j_nn[j];
+//		displs1[j] = j_offset[j] * NX_GLOBAL;
+//	}
+//
+//	for (int i = 0; i < NPX0; i++)
+//	{
+//		sendcounts2[i] = ny * i_nn[i];
+//		displs2[i] = i_offset[i] * ny;
+//	}
+//
+//	int proc_k, k_local;
+//	for (int kk = 0; kk < NZ_GLOBAL; kk++)
+//	{
+//		get_k_node(kk, &proc_k, &k_local);
+//		if (my_id == 0)
+//			FREAD(buff2d, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+//
+//		if (proc_k != 0)
+//		{
+//			// k方向发送
+//			MPI_Status status;
+//			if (my_id == 0)
+//				MPI_Send(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, proc_k * (NPX0 * NPY0), 6666, MPI_COMM_WORLD);
+//			if (my_id == proc_k * NPX0 * NPY0)
+//				MPI_Recv(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, 6666, MPI_COMM_WORLD, &status);
+//		}
+//		if (npz == proc_k)
+//		{
+//			// j方向分散
+//			if (npx == 0)
+//			{
+//				MPI_Scatterv(buff2d, sendcounts1, displs1, OCFD_DATA_TYPE, buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
+//
+//				REAL *pbuff_recv;
+//				REAL *ppU;
+//				// i方向数据准备与离散
+//				for (int npx1 = 0; npx1 < NPX0; npx1++)
+//				{
+//					ppU = buff2 + displs2[npx1];
+//					for (int j = 0; j < ny; j++)
+//					{
+//						for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
+//						{
+//							(*ppU++) = buff1[j][i];
+//						}
+//					}
+//				}
+//			}
+//			//buff_recv = buff2;
+//			MPI_Scatterv(buff2, sendcounts2, displs2, OCFD_DATA_TYPE, buff_recv, nx * ny, OCFD_DATA_TYPE, 0, MPI_COMM_X);
+//
+//			// 数据分布
+//			{
+//				REAL *pbuff_recv;
+//				REAL *ppU;
+//				ppU = pU + k_local * nx * ny;
+//				pbuff_recv = buff_recv;
+//				for (int nn = 0; nn < nx * ny; nn++)
+//				{
+//					(*ppU++) = (*pbuff_recv++);
+//				}
+//			}
+//		}
+//	}
+//
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			free(buff2d);
+//		}
+//		free(buff1);
+//		free(buff2);
+//	}
+//	free(buff_recv);
+//}
+
+//void read_3d(
+//	MPI_File file,
+//	REAL *pU)
+//{
+//	size_t displs_start, displs_end, displs_k_start, displs_k_end;
+//
+//    REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//
+//    displs_start = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
+//    displs_end = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * (NZ_GLOBAL-k_offset[npz]-nz);
+//
+//	displs_k_start = sizeof(int) + (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
+//	displs_k_end = 2*sizeof(int) + (NY_GLOBAL - ny) * NX_GLOBAL * sizeof(REAL);
+//
+//    if (my_id == 0) printf("read 3d data ...\n");
+//
+//    MPI_File_seek(file, displs_start + displs_k_start, MPI_SEEK_CUR);
+//
+//    for(int k=0; k<nz; k++){
+//
+//        MPI_File_read(file, buff_recv, NX_GLOBAL*ny, OCFD_DATA_TYPE, &status);
+//
+//        MPI_File_seek(file, displs_k_end, MPI_SEEK_CUR);
+//
+//    // 数据分布
+//       {
+//           REAL *ppU;
+//       	   ppU = pU + k * nx * ny;
+//           
+//           for(int j=0;j<ny;j++){
+//               for(int i=0;i<nx;i++){
+//           	     *(ppU+j*nx+i) = *(buff_recv+j*NX_GLOBAL+i);
+//               }
+//           }
+//       }
+//    }
+//
+//    MPI_File_seek(file, displs_end - displs_k_start, MPI_SEEK_CUR);		 
+//
+//    free(buff_recv);
+//}
+
+// Read this rank's nx*ny*nz block of a 3D field from a shared MPI file into pU.
+//
+// The offset arithmetic treats the file as a sequence of k-plane records starting at
+// `offset`; each record is an int marker, NX_GLOBAL*NY_GLOBAL REAL values (x fastest)
+// and a second int marker.
+//
+// file   - MPI file handle to read from.
+// offset - byte offset of the first k-plane record of this field.
+// pU     - destination, filled as nz planes of ny rows of nx values (no halo).
+//
+// `status` is not declared here; it is taken from the enclosing scope.
+void read_3d(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	// Size in bytes of one k-plane record, both int markers included.
+	const size_t plane_bytes = 2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL);
+
+	// Position of this rank's first value inside a record: skip the leading marker,
+	// j_offset[npy] full rows and i_offset[npx] values.
+	const size_t displs_k_start = sizeof(int) + (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
+
+	size_t displs_start = plane_bytes * k_offset[npz];
+	displs_start += displs_k_start + offset;
+
+	// Only the span from the first to the last wanted value is needed: ny-1 full-width
+	// rows plus the first nx values of the last row. Requesting NX_GLOBAL*ny values
+	// would run i_offset[npx] values beyond that span, i.e. into the trailing marker /
+	// next record, or past end-of-file on the last plane.
+	const int read_count = (ny > 0) ? (ny - 1) * NX_GLOBAL + nx : 0;
+
+	REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
+	if (buff_recv == NULL && read_count > 0)
+	{
+		fprintf(stderr, "read_3d: cannot allocate the row buffer\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+
+	if (my_id == 0) printf("read 3d data ...\n");
+
+	for (int k = 0; k < nz; k++)
+	{
+		MPI_File_read_at(file, displs_start, buff_recv, read_count, OCFD_DATA_TYPE, &status);
+
+		// Advance to the same in-plane position of the next k-plane record.
+		displs_start += plane_bytes;
+
+		// Keep only the first nx values of each full-width row.
+		REAL *ppU = pU + (size_t)k * nx * ny;
+		for (int j = 0; j < ny; j++)
+		{
+			for (int i = 0; i < nx; i++)
+			{
+				ppU[j * nx + i] = buff_recv[j * NX_GLOBAL + i];
+			}
+		}
+	}
+
+	free(buff_recv);
+}
+//------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------
+
+// Write this rank's nx*ny*nz block of a 3D field into a shared MPI file.
+//
+// For every local k-plane the data is gathered along x (MPI_COMM_X, root 0), reordered
+// into full-width rows, gathered along y (MPI_COMM_Y, root 0), and the rank with
+// npx == 0 && npy == 0 writes the assembled plane as one record:
+// int marker, NX_GLOBAL*NY_GLOBAL REAL values, int marker.
+//
+// file   - MPI file handle to write to.
+// offset - byte offset of the first k-plane record of this field.
+// pU     - source, read as nz planes of nx*ny contiguous values (no halo).
+//
+// `status` is not declared here; it is taken from the enclosing scope.
+void write_3d(
+	MPI_File file,
+	MPI_Offset offset,
+	REAL *pU)
+{
+	// Staging buffers that only exist on gather roots start out as NULL so that the
+	// other ranks pass a defined (ignored) pointer to MPI_Gatherv.
+	REAL(*buff1)[NX_GLOBAL] = NULL, *buff2 = NULL, *buff_send = NULL;
+	int *buff2d = NULL;
+
+	// Payload bytes of one plane, and bytes of a full record (payload + two markers).
+	const size_t size = NX_GLOBAL*NY_GLOBAL*sizeof(REAL);
+	const size_t record_bytes = 2*sizeof(int) + size;
+	size_t displs_k;
+
+	int recvcounts1[NPY0], displs1[NPY0], recvcounts2[NPX0], displs2[NPX0];
+
+
+	displs_k = k_offset[npz] * record_bytes + offset;
+
+	if (npx == 0)
+	{
+		if (npy == 0)
+		{
+			buff2d = (int*)malloc(record_bytes);
+			// Leading and trailing markers hold the payload size. The trailer is
+			// located by byte offset so it is correct for any sizeof(REAL), not
+			// only when REAL is twice the size of int.
+			*buff2d = (int)size;
+			*(int *)((char *)buff2d + sizeof(int) + size) = (int)size;
+		}
+		buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
+		buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
+	}
+	buff_send = (REAL *)malloc(sizeof(REAL) * nx * ny);
+
+	//---------------------------------------------------------------
+	if (my_id == 0)
+		printf("write 3d data ...\n");
+	// recvcounts1 / displs1: element counts and offsets for the gather along j.
+	// That gather happens after the gather along i, so only one column of ranks
+	// (npx == 0) takes part in it.
+	for (int j = 0; j < NPY0; j++)
+	{
+		recvcounts1[j] = NX_GLOBAL * j_nn[j];
+		displs1[j] = j_offset[j] * NX_GLOBAL;
+	}
+	// recvcounts2 / displs2: element counts and offsets for the gather along i.
+	for (int i = 0; i < NPX0; i++)
+	{
+		recvcounts2[i] = ny * i_nn[i];
+		displs2[i] = i_offset[i] * ny;
+	}
+
+	// Loop over the local k-planes.
+	for (int kk = 0; kk < nz; kk++)
+	{
+		REAL *pbuff_send = (REAL *)buff_send;
+		REAL *ppU = pU + kk * nx * ny;
+		// Stage the plane for the gather along i.
+		for (int n = 0; n < nx * ny; n++)
+			(*pbuff_send++) = (*ppU++);
+
+		MPI_Gatherv(buff_send, nx * ny, OCFD_DATA_TYPE, buff2, recvcounts2, displs2, OCFD_DATA_TYPE, 0, MPI_COMM_X);
+
+		if (npx == 0)
+		{
+			// buff2 holds one nx-by-ny tile per x-rank back to back; interleave the
+			// tiles into full-width rows before gathering along j.
+			for (int npx1 = 0; npx1 < NPX0; npx1++)
+			{
+				ppU = buff2 + displs2[npx1];
+
+				for (int j = 0; j < ny; j++)
+				{
+					for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
+					{
+						buff1[j][i] = (*ppU++);
+					}
+				}
+			}
+			// The payload lands right after the leading int marker.
+			MPI_Gatherv(buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, (REAL*)(buff2d + 1), recvcounts1, displs1, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
+		}
+
+
+		if (npx == 0 && npy == 0){
+			// Whole record expressed in int-sized units, derived from its byte size.
+			MPI_File_write_at(file, displs_k, buff2d, (int)(record_bytes / sizeof(int)), MPI_INT, &status);
+		}
+
+		displs_k += record_bytes;
+	}
+
+	if (npx == 0)
+	{
+		if (npy == 0)
+		{
+			free(buff2d);
+		}
+		free(buff1);
+		free(buff2);
+	}
+	free(buff_send);
+}
+
+
+//void write_3d(
+//	MPI_File file,
+//	REAL *pU)
+//{
+//    size_t displs_xy;
+//	size_t size = NX_GLOBAL*NY_GLOBAL*sizeof(REAL);
+//    size_t displs_non0_start = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
+//	size_t displs_non0_end = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL))*(NZ_GLOBAL-k_offset[npz]-nz);
+//
+//    REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * nx * ny);
+//    displs_xy = (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
+//
+//    if(my_id == 0){
+//        for(int i=0; i<k_offset[npz]; i++){
+//            MPI_File_write_all(file, &size, 1, MPI_INT, &status);
+//            MPI_File_seek(file, size, MPI_SEEK_CUR);		
+//            MPI_File_write_all(file, &size, 1, MPI_INT, &status);
+//        }
+//    }else{
+//        MPI_File_seek(file, displs_non0_start, MPI_SEEK_CUR);		
+//    }
+//
+//    for(int k=0; k<nz; k++){
+//    // 数据分布
+//       {
+//           REAL *ppU;
+//       	   ppU = pU + k * nx * ny;
+//           
+//           for(int j=0;j<ny;j++){
+//               for(int i=0;i<nx;i++){
+//		           *(buff_recv+j*nx+i) = *(ppU+j*nx+i);
+//               }
+//           }
+//       }
+//
+//        if(my_id == 0){
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//	    }else{
+//            MPI_File_seek(file, sizeof(int), MPI_SEEK_CUR);
+//	    }
+//
+//        MPI_File_seek(file, displs_xy, MPI_SEEK_CUR);
+//
+//        for(int j = 0; j < ny; j++){
+//            MPI_File_write_all(file, buff_recv + nx*j, nx, OCFD_DATA_TYPE, &status);
+//
+//            MPI_File_seek(file, sizeof(REAL)*(NX_GLOBAL-nx), MPI_SEEK_CUR);
+//        }
+//
+//        MPI_File_seek(file, sizeof(REAL)*((NY_GLOBAL-j_offset[npy]-ny)*NX_GLOBAL-i_offset[npx]), MPI_SEEK_CUR);
+//       
+//       	if(my_id == 0){
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//	    }else{
+//            MPI_File_seek(file, sizeof(int), MPI_SEEK_CUR);
+//	    }
+//
+//   }
+//
+//    if(my_id == 0){
+//        for(int i=0; i<(NZ_GLOBAL-k_offset[npz]-nz); i++){
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//            MPI_File_seek(file, size, MPI_SEEK_CUR);		
+//	        MPI_File_write_all(file, &size, 1,  MPI_INT, &status);
+//        }
+//    }else{
+//        MPI_File_seek(file, displs_non0_end, MPI_SEEK_CUR);		
+//    }
+//
+//   if (my_id == 0) printf("write 3d data ...\n");
+//
+//   free(buff_recv);
+//}
+//------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------
+//void write_3d(
+//	FILE *file,
+//	REAL *pU)
+//{
+//
+//	REAL(*U)
+//	[ny][nx] = PTR2ARRAY2(pU, nx, ny);
+//	REAL(*buff2d)
+//	[NX_GLOBAL], (*buff1)[NX_GLOBAL], *buff2, *buff_send;
+//
+//	int recvcounts1[NPY0], displs1[NPY0], recvcounts2[NPX0], displs2[NPX0];
+//
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			buff2d = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
+//		}
+//		buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//		buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
+//	}
+//	buff_send = (REAL *)malloc(sizeof(REAL) * nx * ny);
+//
+//	//---------------------------------------------------------------
+//	if (my_id == 0)
+//		printf("write 3d data ...\n");
+//	// recvconts1 ， displs1 存储j方向收集时所使用的个数与偏移，
+//	// 由于j方向收集发生在i方向收集之后，因此只有一列参与j方向收集
+//	for (int j = 0; j < NPY0; j++)
+//	{
+//		recvcounts1[j] = NX_GLOBAL * j_nn[j];
+//		displs1[j] = j_offset[j] * NX_GLOBAL;
+//	}
+//	// i方向收集所需偏移与数量
+//	for (int i = 0; i < NPX0; i++)
+//	{
+//		recvcounts2[i] = ny * i_nn[i];
+//		displs2[i] = i_offset[i] * ny;
+//	}
+//
+//	// 按数据的k面进行循环
+//	int proc_k, k_local;
+//	for (int kk = 0; kk < NZ_GLOBAL; kk++)
+//	{
+//		get_k_node(kk, &proc_k, &k_local);
+//		if (npz == proc_k)
+//		{
+//			REAL *pbuff_send = (REAL *)buff_send;
+//			REAL *ppU = pU + k_local * nx * ny;
+//			// i方向收集数据准备
+//			for (int n = 0; n < nx * ny; n++)
+//				(*pbuff_send++) = (*ppU++);
+//			MPI_Gatherv(buff_send, nx * ny, OCFD_DATA_TYPE, buff2, recvcounts2, displs2, OCFD_DATA_TYPE, 0, MPI_COMM_X);
+//
+//			if (npx == 0)
+//			{
+//				// j方向收集数据调序
+//				for (int npx1 = 0; npx1 < NPX0; npx1++)
+//				{
+//					ppU = buff2 + displs2[npx1];
+//
+//					for (int j = 0; j < ny; j++)
+//					{
+//						for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
+//						{
+//							buff1[j][i] = (*ppU++);
+//						}
+//					}
+//				}
+//				MPI_Gatherv(buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, buff2d, recvcounts1, displs1, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
+//			}
+//		}
+//
+//		//
+//
+//		// k 方向收集
+//		if (proc_k != 0)
+//		{
+//			if (npx == 0 && npy == 0 && npz == proc_k)
+//			{
+//				MPI_Send(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, 666, MPI_COMM_WORLD);
+//			}
+//			if (my_id == 0)
+//			{
+//				MPI_Status status;
+//				MPI_Recv(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, proc_k * NPX0 * NPY0, 666, MPI_COMM_WORLD, &status);
+//			}
+//		}
+//		if (my_id == 0)
+//			FWRITE(buff2d, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
+//	}
+//
+//	if (npx == 0)
+//	{
+//		if (npy == 0)
+//		{
+//			free(buff2d);
+//		}
+//		free(buff1);
+//		free(buff2);
+//	}
+//	free(buff_send);
+//}
+
+//------------------------------------------------------------------------------------------------------------------
+//------------------------------------Write blockdata from 3d array-------------------------------------------------
+
+// Extract the index box [ib..ie] x [jb..je] x [kb..ke] from a distributed, padded 3D
+// field and write it to `file` from the rank with my_id == 0.
+//
+// Every rank copies the part of the box it owns into a zero-filled box-sized buffer;
+// the buffers are summed onto rank 0 with MPI_Reduce, so each box cell receives the
+// value of the rank that set it (assuming no cell is set by more than one rank).
+//
+// file          - output stream; only touched on the rank with my_id == 0.
+// pU            - local field, viewed as U[k][j][i] with plane shape
+//                 (ny + 2*LAP) x (nx + 2*LAP); interior cells start at index LAP.
+// ib,ie / jb,je / kb,ke - inclusive box bounds, assumed to be Fortran-style 1-based
+//                 indices as in the input file.
+//
+// The two box buffers live on the heap: the box size comes from the input and may be
+// too large for stack-allocated variable-length arrays.
+void write_blockdata(
+	FILE *file,
+	REAL *pU,
+	int ib,
+	int ie,
+	int jb,
+	int je,
+	int kb,
+	int ke)
+{
+	int nx1 = ie - ib + 1, ny1 = je - jb + 1, nz1 = ke - kb + 1;
+	int i, j, k, i0, j0, k0, i1, j1, k1;
+
+	// An empty or inverted box has nothing to write. The bounds are expected to be
+	// identical on every rank, so all ranks skip the collective together.
+	if (nx1 <= 0 || ny1 <= 0 || nz1 <= 0)
+		return;
+
+	const int count = nx1 * ny1 * nz1;
+
+	REAL(*U)
+	[ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
+
+	// U1: this rank's contribution; U0: reduction result (meaningful on rank 0).
+	REAL *U1 = (REAL *)malloc(sizeof(REAL) * (size_t)count);
+	REAL *U0 = (REAL *)malloc(sizeof(REAL) * (size_t)count);
+	if (U1 == NULL || U0 == NULL)
+	{
+		fprintf(stderr, "write_blockdata: cannot allocate the block buffers\n");
+		MPI_Abort(MPI_COMM_WORLD, 1);
+	}
+	//--------------------------------
+	for (int n = 0; n < count; n++)
+	{
+		U1[n] = 0.0;
+		U0[n] = 0.0;
+	}
+
+	// The input file is assumed to use Fortran indices starting at 1; shift the
+	// lower bounds to 0-based so the box becomes [ib, ie) x [jb, je) x [kb, ke).
+	ib -= 1;
+	jb -= 1;
+	kb -= 1;
+
+	// Global index of this rank's first interior cell along each axis.
+	int gkb = k_offset[npz];
+	int gjb = j_offset[npy];
+	int gib = i_offset[npx];
+
+	for (k = 0; k < nz; k++)
+	{
+		k0 = k + gkb;
+		if (!(k0 >= kb && k0 < ke))
+			continue;
+		k1 = k0 - kb;
+		for (j = 0; j < ny; j++)
+		{
+			j0 = j + gjb;
+			if (!(j0 >= jb && j0 < je))
+				continue;
+			j1 = j0 - jb;
+			for (i = 0; i < nx; i++)
+			{
+				i0 = i + gib;
+				if (!(i0 >= ib && i0 < ie))
+					continue;
+				i1 = i0 - ib;
+				// Box layout is [nz1][ny1][nx1], x fastest.
+				U1[((size_t)k1 * ny1 + j1) * nx1 + i1] = U[k + LAP][j + LAP][i + LAP];
+			}
+		}
+	}
+	MPI_Reduce(U1, U0, count, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	// FWRITE is a project macro invoked without a trailing semicolon.
+	if (my_id == 0)
+		FWRITE(U0, sizeof(REAL), count, file)
+	free(U1);
+	free(U0);
+}
+#ifdef __cplusplus
+}
+#endif
+//--------------------------------------------------
--- a/src/OCFD_NS_Jacobian3d.cu
+++ b/src/OCFD_NS_Jacobian3d.cu
+#include <math.h>
+
+#include "OCFD_NS_Jacobian3d.h"
+#include "parameters.h"
+#include "OCFD_Schemes_Choose.h"
+#include "OCFD_split.h"
+
+#include "commen_kernel.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "OCFD_mpi_dev.h"
+#include "parameters_d.h"
+#include "OCFD_flux_charteric.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Launch sound_speed_kernel on *pT_d / *pcc_d over the given job widened by LAP cells
+// on both sides of every axis, on the given stream.
+void du_invis_Jacobian3d_init(cudaJobPackage job_in, cudaStream_t *stream){
+	// Extent of the incoming job.
+	dim3 size;
+	jobsize(&job_in, &size);
+
+	// Launch geometry sized for the widened region.
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim,
+	                   BlockDimX, BlockDimY, BlockDimZ,
+	                   size.x+2*LAP, size.y+2*LAP, size.z+2*LAP);
+
+	// Same region as job_in, pushed out by LAP at both ends.
+	cudaJobPackage job(
+		dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP),
+		dim3(job_in.end.x + LAP, job_in.end.y + LAP, job_in.end.z + LAP) );
+
+	CUDA_LAUNCH(( sound_speed_kernel<<<griddim , blockdim, 0, *stream>>>(*pT_d , *pcc_d , job) ));
+}
+
+
+// Inviscid x-direction step: OCFD_dx1 is applied to *fp and OCFD_dx2 to *fm, both with
+// the Ak* metric fields, the x block size and the x boundary flags D0_bound[0..1].
+// (fp / fm are presumably the positive / negative split fluxes -- confirm.)
+void du_invis_Jacobian3d_x(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, cudaStream_t *stream){
+	OCFD_dx1(*fp, *pdu_d, *pAjac_d,
+	         *pu_d, *pv_d, *pw_d, *pcc_d,
+	         *pAkx_d, *pAky_d, *pAkz_d,
+	         job_in, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
+
+	OCFD_dx2(*fm, *pdu_d, *pAjac_d,
+	         *pu_d, *pv_d, *pw_d, *pcc_d,
+	         *pAkx_d, *pAky_d, *pAkz_d,
+	         job_in, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
+}
+
+// Inviscid y-direction step: OCFD_dy1 is applied to *fp and OCFD_dy2 to *fm, both with
+// the Ai* metric fields, the y block size and the y boundary flags D0_bound[2..3].
+// (fp / fm are presumably the positive / negative split fluxes -- confirm.)
+void du_invis_Jacobian3d_y(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, cudaStream_t *stream){
+	OCFD_dy1(*fp, *pdu_d, *pAjac_d,
+	         *pu_d, *pv_d, *pw_d, *pcc_d,
+	         *pAix_d, *pAiy_d, *pAiz_d,
+	         job_in, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
+
+	OCFD_dy2(*fm, *pdu_d, *pAjac_d,
+	         *pu_d, *pv_d, *pw_d, *pcc_d,
+	         *pAix_d, *pAiy_d, *pAiz_d,
+	         job_in, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
+}
+
+
+// Inviscid z-direction step: OCFD_dz1 is applied to *fp and OCFD_dz2 to *fm, both with
+// the As* metric fields, the z block size and the z boundary flags D0_bound[4..5].
+// (fp / fm are presumably the positive / negative split fluxes -- confirm.)
+void du_invis_Jacobian3d_z(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, cudaStream_t *stream){
+	OCFD_dz1(*fp, *pdu_d, *pAjac_d,
+	         *pu_d, *pv_d, *pw_d, *pcc_d,
+	         *pAsx_d, *pAsy_d, *pAsz_d,
+	         job_in, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);
+
+	OCFD_dz2(*fm, *pdu_d, *pAjac_d,
+	         *pu_d, *pv_d, *pw_d, *pcc_d,
+	         *pAsx_d, *pAsy_d, *pAsz_d,
+	         job_in, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);
+}
+
+// ========================================================
+
+// Apply OCFD_dx0 / OCFD_dy0 / OCFD_dz0 to each of u, v, w and T over the region
+// [LAP, n*_lap) on the given stream, storing the twelve results in the
+// *k (x), *i (y) and *s (z) fields.
+void du_viscous_Jacobian3d_init(cudaStream_t *stream){
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// x direction -> uk, vk, wk, Tk
+	OCFD_dx0(*pu_d, *puk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pv_d, *pvk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pw_d, *pwk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pT_d, *pTk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
+
+	// y direction -> ui, vi, wi, Ti
+	OCFD_dy0(*pu_d, *pui_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pv_d, *pvi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pw_d, *pwi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pT_d, *pTi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
+
+	// z direction -> us, vs, ws, Ts
+	OCFD_dz0(*pu_d, *pus_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pv_d, *pvs_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pw_d, *pws_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pT_d, *pTs_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
+}
+
+
+// Device helper: compute the six independent viscous stress components at one cell.
+//
+// The velocity derivatives stored in vf (uk..ws, read at x-LAP, y-LAP, z-LAP) are
+// combined with the nine metric coefficients into ux..wz, and the stresses
+//   s_ii = (2*d_i u_i - 2/3*div) * Amu,   s_ij = (d_j u_i + d_i u_j) * Amu
+// are written through s11..s33.
+//
+// Akx..Asz, Amu - inputs, passed by pointer.
+// s11..s33      - outputs.
+// x, y, z       - cell indices including the LAP offset.
+__device__ void vis_flux_s_ker(
+	vis_flux vf,
+
+	REAL *Akx,
+	REAL *Aix,
+	REAL *Asx,
+	REAL *Aky,
+	REAL *Aiy,
+	REAL *Asy,
+	REAL *Akz,
+	REAL *Aiz,
+	REAL *Asz,
+
+	REAL *Amu,
+
+	REAL *s11,
+	REAL *s12,
+	REAL *s13,
+	REAL *s22,
+	REAL *s23,
+	REAL *s33,
+
+	int x,
+	int y,
+	int z
+){
+	// Stored derivatives of u, v, w along the three grid directions.
+	const REAL uk = get_Field(vf.uk, x-LAP, y-LAP, z-LAP);
+	const REAL ui = get_Field(vf.ui, x-LAP, y-LAP, z-LAP);
+	const REAL us = get_Field(vf.us, x-LAP, y-LAP, z-LAP);
+	const REAL vk = get_Field(vf.vk, x-LAP, y-LAP, z-LAP);
+	const REAL vi = get_Field(vf.vi, x-LAP, y-LAP, z-LAP);
+	const REAL vs = get_Field(vf.vs, x-LAP, y-LAP, z-LAP);
+	const REAL wk = get_Field(vf.wk, x-LAP, y-LAP, z-LAP);
+	const REAL wi = get_Field(vf.wi, x-LAP, y-LAP, z-LAP);
+	const REAL ws = get_Field(vf.ws, x-LAP, y-LAP, z-LAP);
+
+	// Metric coefficients and viscosity, read once.
+	const REAL kx = *Akx, ix = *Aix, sx = *Asx;
+	const REAL ky = *Aky, iy = *Aiy, sy = *Asy;
+	const REAL kz = *Akz, iz = *Aiz, sz = *Asz;
+	const REAL mu = *Amu;
+
+	// Chain rule: derivatives with respect to x, y, z.
+	const REAL ux = uk*kx + ui*ix + us*sx;
+	const REAL vx = vk*kx + vi*ix + vs*sx;
+	const REAL wx = wk*kx + wi*ix + ws*sx;
+
+	const REAL uy = uk*ky + ui*iy + us*sy;
+	const REAL vy = vk*ky + vi*iy + vs*sy;
+	const REAL wy = wk*ky + wi*iy + ws*sy;
+
+	const REAL uz = uk*kz + ui*iz + us*sz;
+	const REAL vz = vk*kz + vi*iz + vs*sz;
+	const REAL wz = wk*kz + wi*iz + ws*sz;
+
+	const REAL div = ux+vy+wz;
+
+	// Normal stresses.
+	*s11 = (2.0*ux-2.0/3.0*div) * mu;
+	*s22 = (2.0*vy-2.0/3.0*div) * mu;
+	*s33 = (2.0*wz-2.0/3.0*div) * mu;
+
+	// Shear stresses.
+	*s12 = (uy+vx)* mu;
+	*s13 = (uz+wx)* mu;
+	*s23 = (vz+wy)* mu;
+}
+
+
+// Device helper: compute the three energy-equation viscous flux components at one cell.
+//
+// E_i = u*s_i1 + v*s_i2 + w*s_i3 + (Amu * vis_flux_init_c_d) * dT/dx_i, where the
+// temperature gradient is assembled from vf.Tk/Ti/Ts (read at x-LAP, y-LAP, z-LAP) and
+// the metric coefficients, and u, v, w are read with get_Field_LAP at (x, y, z).
+//
+// Amu, Akx..Asz, s11..s33 - inputs, passed by pointer.
+// E1, E2, E3              - outputs.
+__device__ void vis_flux_e_ker(
+	vis_flux vf,
+
+	REAL *Amu,
+	REAL *Akx,
+	REAL *Aky,
+	REAL *Akz,
+	REAL *Aix,
+	REAL *Aiy,
+	REAL *Aiz,
+	REAL *Asx,
+	REAL *Asy,
+	REAL *Asz,
+
+	REAL *s11,
+	REAL *s12,
+	REAL *s13,
+	REAL *s22,
+	REAL *s23,
+	REAL *s33,
+
+	REAL *E1,
+	REAL *E2,
+	REAL *E3,
+
+	int x,
+	int y,
+	int z
+){
+	// Stored temperature derivatives along the three grid directions.
+	const REAL Tk = get_Field(vf.Tk, x-LAP, y-LAP, z-LAP);
+	const REAL Ti = get_Field(vf.Ti, x-LAP, y-LAP, z-LAP);
+	const REAL Ts = get_Field(vf.Ts, x-LAP, y-LAP, z-LAP);
+
+	// Velocity at this cell.
+	const REAL u  = get_Field_LAP(vf.u, x, y, z);
+	const REAL v  = get_Field_LAP(vf.v, x, y, z);
+	const REAL w  = get_Field_LAP(vf.w, x, y, z);
+
+	// Coefficient multiplying the temperature gradient.
+	const REAL Amuk = *Amu * vis_flux_init_c_d;
+
+	// Temperature gradient with respect to x, y, z.
+	const REAL Tx = Tk* *Akx + Ti* *Aix + Ts* *Asx;
+	const REAL Ty = Tk* *Aky + Ti* *Aiy + Ts* *Asy;
+	const REAL Tz = Tk* *Akz + Ti* *Aiz + Ts* *Asz;
+
+	// Stress terms cached so each input is dereferenced once.
+	const REAL t11 = *s11, t12 = *s12, t13 = *s13;
+	const REAL t22 = *s22, t23 = *s23, t33 = *s33;
+
+	*E1 = u*t11 + v*t12 + w*t13 + Amuk*Tx;
+	*E2 = u*t12 + v*t22 + w*t23 + Amuk*Ty;
+	*E3 = u*t13 + v*t23 + w*t33 + Amuk*Tz;
+}
+
+
+// Device helper: contract the stresses and energy fluxes with the direction vector
+// (vf.Ax, vf.Ay, vf.Az) scaled by vf.Ajac, and store the four results in Ev1..Ev4 at
+// (x, y, z) via get_Field_LAP.
+//
+// s11..s33, E1..E3 - inputs, passed by pointer.
+// Ev1..Ev4         - output fields.
+__device__ void vis_flus_ev_ker(
+	vis_flux vf,
+
+	REAL *s11,
+	REAL *s12,
+	REAL *s13,
+	REAL *s22,
+	REAL *s23,
+	REAL *s33,
+
+	REAL *E1,
+	REAL *E2,
+	REAL *E3,
+
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+
+	int x,
+	int y,
+	int z
+){
+	// Direction vector scaled by the value of vf.Ajac at this cell.
+	const REAL Aj1 = get_Field_LAP(vf.Ajac , x,y,z);
+	const REAL akx = get_Field_LAP(vf.Ax, x, y, z)*Aj1;
+	const REAL aky = get_Field_LAP(vf.Ay, x, y, z)*Aj1;
+	const REAL akz = get_Field_LAP(vf.Az, x, y, z)*Aj1;
+
+	// Three momentum components, then the energy component.
+	get_Field_LAP(Ev1, x, y, z) = ( akx* *s11 + aky* *s12 + akz* *s13 );
+	get_Field_LAP(Ev2, x, y, z) = ( akx* *s12 + aky* *s22 + akz* *s23 );
+	get_Field_LAP(Ev3, x, y, z) = ( akx* *s13 + aky* *s23 + akz* *s33 );
+	get_Field_LAP(Ev4, x, y, z) = ( akx* *E1  + aky* *E2  + akz* *E3  );
+}
+
+
+// Kernel: one thread per cell of [job.start, job.end). Each thread loads the nine
+// metric coefficients and Amu for its cell, then chains the three device helpers to
+// produce the stresses, the energy fluxes and finally Ev1..Ev4.
+__global__ void vis_flux_ker(
+
+	vis_flux vf,
+
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+
+	cudaJobPackage job)
+{
+	// Cell indices including the LAP offset carried by job.start.
+	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads that fall outside the job have nothing to do.
+	if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+	REAL Akx = get_Field_LAP(vf.Akx, x, y, z);
+	REAL Aix = get_Field_LAP(vf.Aix, x, y, z);
+	REAL Asx = get_Field_LAP(vf.Asx, x, y, z);
+	REAL Aky = get_Field_LAP(vf.Aky, x, y, z);
+	REAL Aiy = get_Field_LAP(vf.Aiy, x, y, z);
+	REAL Asy = get_Field_LAP(vf.Asy, x, y, z);
+	REAL Akz = get_Field_LAP(vf.Akz, x, y, z);
+	REAL Aiz = get_Field_LAP(vf.Aiz, x, y, z);
+	REAL Asz = get_Field_LAP(vf.Asz, x, y, z);
+
+	// Amu is read with get_Field at the LAP-shifted index.
+	REAL Amu = get_Field(vf.Amu, x-LAP, y-LAP, z-LAP);
+
+	// Intermediate results handed from one helper to the next.
+	REAL s11, s12, s13, s22, s23, s33;
+	REAL E1, E2, E3;
+
+	vis_flux_s_ker(vf,
+		&Akx,&Aix,&Asx,&Aky,&Aiy,&Asy,&Akz,&Aiz,&Asz,
+		&Amu,
+		&s11,&s12,&s13,&s22,&s23,&s33,
+		x,y,z);
+
+	vis_flux_e_ker(vf,
+		&Amu,
+		&Akx,&Aky,&Akz,&Aix,&Aiy,&Aiz,&Asx,&Asy,&Asz,
+		&s11,&s12,&s13,&s22,&s23,&s33,
+		&E1,&E2,&E3,
+		x,y,z);
+
+	vis_flus_ev_ker(vf,
+		&s11,&s12,&s13,&s22,&s23,&s33,
+		&E1,&E2,&E3,
+		Ev1,Ev2,Ev3,Ev4,
+		x,y,z);
+}
+
+
+// Fill *pEv1_d..*pEv4_d by launching vis_flux_ker over [LAP, n*_lap) with the Ak*
+// fields passed in the slots between the velocities and Ajac (presumably the
+// Ax/Ay/Az direction vector of vis_flux -- the struct layout is not visible here).
+void du_viscous_Jacobian3d_x_init(cudaStream_t *stream){
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// This kernel uses its own 8 x 4 x 4 thread block instead of the global BlockDim*.
+	uint32_t BlockDimX1 = 8;
+	uint32_t BlockDimY1 = 4;
+	uint32_t BlockDimZ1 = 4;
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX1, BlockDimY1, BlockDimZ1, nx, ny, nz);
+
+	vis_flux vis_flux_parameter = {
+		// velocity derivatives
+		*puk_d,*pvk_d,*pwk_d,*pui_d,*pvi_d,*pwi_d,*pus_d,*pvs_d,*pws_d,
+		// temperature derivatives and viscosity
+		*pTk_d,*pTi_d,*pTs_d,*pAmu_d,
+		// velocities, then the direction-specific metric triple
+		*pu_d,*pv_d,*pw_d,*pAkx_d,*pAky_d,*pAkz_d,
+		// Jacobian and the full set of nine metrics
+		*pAjac_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d};
+
+	CUDA_LAUNCH(( vis_flux_ker<<<griddim , blockdim, 0, *stream>>>(vis_flux_parameter, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
+}
+
+// Apply OCFD_dx0 to Ev1..Ev4 (results in vis_u/v/w/T) and hand each result to
+// YF_Pe_XF together with *pAjac_d and a view of pdu_d. The views are components
+// 1..4 of pdu_d, each pitch*ny*nz elements long; component 0 is not touched here.
+void du_viscous_Jacobian3d_x_final(cudaJobPackage job_in, cudaStream_t *stream){
+
+	dim3 size;
+	jobsize(&job_in, &size);
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+	OCFD_dx0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+	OCFD_dx0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
+
+	// The YF_Pe_XF launches use the job shifted down by LAP at both ends.
+	cudaJobPackage job(
+		dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP) ,
+		dim3(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP));
+
+	// Element count of one component of pdu_d.
+	int size_du = pdu_d->pitch*ny*nz;
+
+	// Sliding view over pdu_d, starting at component 1.
+	cudaField tmp_du;
+	tmp_du.pitch = pdu_d->pitch;
+	tmp_du.ptr = pdu_d->ptr + size_du;
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_u_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 2
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_v_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 3
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_w_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 4
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_T_d, *pAjac_d, job) ));
+}
+
+// Fill *pEv1_d..*pEv4_d by launching vis_flux_ker over [LAP, n*_lap) with the Ai*
+// fields passed in the slots between the velocities and Ajac (presumably the
+// Ax/Ay/Az direction vector of vis_flux -- the struct layout is not visible here).
+void du_viscous_Jacobian3d_y_init(cudaStream_t *stream){
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// This kernel uses its own 8 x 4 x 4 thread block instead of the global BlockDim*.
+	uint32_t BlockDimX1 = 8;
+	uint32_t BlockDimY1 = 4;
+	uint32_t BlockDimZ1 = 4;
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX1, BlockDimY1, BlockDimZ1, nx, ny, nz);
+
+	vis_flux vis_flux_parameter = {
+		// velocity derivatives
+		*puk_d,*pvk_d,*pwk_d,*pui_d,*pvi_d,*pwi_d,*pus_d,*pvs_d,*pws_d,
+		// temperature derivatives and viscosity
+		*pTk_d,*pTi_d,*pTs_d,*pAmu_d,
+		// velocities, then the direction-specific metric triple
+		*pu_d,*pv_d,*pw_d,*pAix_d,*pAiy_d,*pAiz_d,
+		// Jacobian and the full set of nine metrics
+		*pAjac_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d};
+
+	CUDA_LAUNCH(( vis_flux_ker<<<griddim , blockdim, 0, *stream>>>(vis_flux_parameter, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
+}
+
+// Apply OCFD_dy0 to Ev1..Ev4 (results in vis_u/v/w/T) and hand each result to
+// YF_Pe_XF together with *pAjac_d and a view of pdu_d. The views are components
+// 1..4 of pdu_d, each pitch*ny*nz elements long; component 0 is not touched here.
+void du_viscous_Jacobian3d_y_final(cudaJobPackage job_in, cudaStream_t *stream){
+
+	dim3 size;
+	jobsize(&job_in, &size);
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+	OCFD_dy0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+	OCFD_dy0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
+
+	// The YF_Pe_XF launches use the job shifted down by LAP at both ends.
+	cudaJobPackage job(
+		dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP) ,
+		dim3(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP));
+
+	// Element count of one component of pdu_d.
+	int size_du = pdu_d->pitch*ny*nz;
+
+	// Sliding view over pdu_d, starting at component 1.
+	cudaField tmp_du;
+	tmp_du.pitch = pdu_d->pitch;
+	tmp_du.ptr = pdu_d->ptr + size_du;
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_u_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 2
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_v_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 3
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_w_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 4
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_T_d, *pAjac_d, job) ));
+}
+
+
+// Fill *pEv1_d..*pEv4_d by launching vis_flux_ker over [LAP, n*_lap) with the As*
+// fields passed in the slots between the velocities and Ajac (presumably the
+// Ax/Ay/Az direction vector of vis_flux -- the struct layout is not visible here).
+void du_viscous_Jacobian3d_z_init(cudaStream_t *stream){
+
+	cudaJobPackage job( dim3(LAP, LAP, LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+	// This kernel uses its own 8 x 4 x 4 thread block instead of the global BlockDim*.
+	uint32_t BlockDimX1 = 8;
+	uint32_t BlockDimY1 = 4;
+	uint32_t BlockDimZ1 = 4;
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX1, BlockDimY1, BlockDimZ1, nx, ny, nz);
+
+	vis_flux vis_flux_parameter = {
+		// velocity derivatives
+		*puk_d,*pvk_d,*pwk_d,*pui_d,*pvi_d,*pwi_d,*pus_d,*pvs_d,*pws_d,
+		// temperature derivatives and viscosity
+		*pTk_d,*pTi_d,*pTs_d,*pAmu_d,
+		// velocities, then the direction-specific metric triple
+		*pu_d,*pv_d,*pw_d,*pAsx_d,*pAsy_d,*pAsz_d,
+		// Jacobian and the full set of nine metrics
+		*pAjac_d,*pAkx_d,*pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d};
+
+	CUDA_LAUNCH(( vis_flux_ker<<<griddim , blockdim, 0, *stream>>>(vis_flux_parameter, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
+}
+
+
+// Apply OCFD_dz0 to Ev1..Ev4 (results in vis_u/v/w/T) and hand each result to
+// YF_Pe_XF together with *pAjac_d and a view of pdu_d. The views are components
+// 1..4 of pdu_d, each pitch*ny*nz elements long; component 0 is not touched here.
+void du_viscous_Jacobian3d_z_final(cudaJobPackage job_in, cudaStream_t *stream){
+
+	dim3 size;
+	jobsize(&job_in, &size);
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+	OCFD_dz0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+	OCFD_dz0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
+
+	// The YF_Pe_XF launches use the job shifted down by LAP at both ends.
+	cudaJobPackage job(
+		dim3(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP) ,
+		dim3(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP));
+
+	// Element count of one component of pdu_d.
+	int size_du = pdu_d->pitch*ny*nz;
+
+	// Sliding view over pdu_d, starting at component 1.
+	cudaField tmp_du;
+	tmp_du.pitch = pdu_d->pitch;
+	tmp_du.ptr = pdu_d->ptr + size_du;
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_u_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 2
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_v_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 3
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_w_d, *pAjac_d, job) ));
+
+	tmp_du.ptr += size_du;	// component 4
+	CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_T_d, *pAjac_d, job) ));
+}
+
+// Kernel for the low-y side: each cell (x, y, z) of the job is filled from its mirror
+// cell at y1 = 2*LAP - y. Ev2 is copied unchanged; Ev1, Ev3 and Ev4 are copied with
+// their sign flipped.
+__global__ void boundary_symmetry_pole_vis_y_ker_m(
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+	cudaJobPackage job){
+
+	// Cell indices including the LAP offset carried by job.start.
+	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads that fall outside the job have nothing to do.
+	if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+	// Mirror of y about index LAP.
+	unsigned int y1 = 2*LAP - y;
+
+	get_Field_LAP(Ev1 , x,y,z) = - get_Field_LAP(Ev1 , x,y1,z);
+	get_Field_LAP(Ev2 , x,y,z) =   get_Field_LAP(Ev2 , x,y1,z);
+	get_Field_LAP(Ev3 , x,y,z) = - get_Field_LAP(Ev3 , x,y1,z);
+	get_Field_LAP(Ev4 , x,y,z) = - get_Field_LAP(Ev4 , x,y1,z);
+}
+
+// Kernel for the high-y side: each cell (x, y, z) of the job is filled from its mirror
+// cell at y1 = 2*(ny_d+LAP-1) - y. Ev2 is copied unchanged; Ev1, Ev3 and Ev4 are
+// copied with their sign flipped.
+__global__ void boundary_symmetry_pole_vis_y_ker_p(
+	cudaField Ev1,
+	cudaField Ev2,
+	cudaField Ev3,
+	cudaField Ev4,
+	cudaJobPackage job){
+
+	// Cell indices including the LAP offset carried by job.start.
+	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads that fall outside the job have nothing to do.
+	if( !(x<job.end.x && y<job.end.y && z<job.end.z) ) return;
+
+	// Mirror of y about index ny_d+LAP-1.
+	unsigned int y1 = 2*(ny_d+LAP-1) - y;
+
+	get_Field_LAP(Ev1 , x,y,z) = - get_Field_LAP(Ev1 , x,y1,z);
+	get_Field_LAP(Ev2 , x,y,z) =   get_Field_LAP(Ev2 , x,y1,z);
+	get_Field_LAP(Ev3 , x,y,z) = - get_Field_LAP(Ev3 , x,y1,z);
+	get_Field_LAP(Ev4 , x,y,z) = - get_Field_LAP(Ev4 , x,y1,z);
+}
+
+// Symmetry / pole boundary condition in y for the viscous terms Ev1..Ev4.
+// Does nothing unless IF_SYMMETRY == 1. The process row npy == 0 fills the LAP layers
+// y in [0, LAP); the row npy == NPY0-1 fills y in [ny_lap, ny_2lap).
+void boundary_symmetry_pole_vis_y(cudaStream_t *stream){
+	if(IF_SYMMETRY != 1) return;
+
+	dim3 blockdim , griddim;
+
+	// Low-y side.
+	if(npy == 0){
+		cal_grid_block_dim(&griddim , &blockdim , BlockDimX , LAP , BlockDimZ , nx , LAP , nz);
+		cudaJobPackage job(dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap));
+		CUDA_LAUNCH(( boundary_symmetry_pole_vis_y_ker_m<<<griddim , blockdim, 0, *stream>>>(*pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d , job) ));
+	}
+
+	// High-y side.
+	if(npy == NPY0-1){
+		cal_grid_block_dim(&griddim , &blockdim , BlockDimX , LAP , BlockDimZ , nx , LAP , nz);
+		cudaJobPackage job(dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap));
+		CUDA_LAUNCH(( boundary_symmetry_pole_vis_y_ker_p<<<griddim , blockdim, 0, *stream>>>(*pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d , job) ));
+	}
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_NS_Solver.cu
+++ b/src/OCFD_NS_Solver.cu
+#ifndef __NS_SOLVER_C
+#define __NS_SOLVER_C
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <netdb.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <math.h>
+
+#include "mpi.h"
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_NS_Solver.h"
+#include "OCFD_time.h"
+#include "OCFD_mpi.h"
+#include "OCFD_boundary.h"
+#include "OCFD_IO.h"
+#include "OCFD_init.h"
+#include "OCFD_Stream.h"
+#include "OCFD_filtering.h"
+#include "OCFD_ana.h"
+
+#include "OCFD_mpi_dev.h"
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "commen_kernel.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+// Main time-marching driver of the solver.
+//
+// Sequence, as implemented below:
+//   1. exchange the halo data of d, u, v, w, T, apply the boundary conditions (OCFD_bc)
+//      and refresh the viscosity (get_Amu);
+//   2. loop: swap the pf/pfn buffers (host and device), run three Runge-Kutta sub-steps,
+//      advance Istep/tt, filter, run the registered analyses, periodically print
+//      diagnostics and periodically save the flow field;
+//   3. leave the loop when tt >= end_time (or right after the first step if end_time <= 0),
+//      then report the total wall-clock time on rank 0.
+//
+// Takes no arguments and returns nothing; it works entirely on the project globals
+// (pd, pu, ..., pf_d, Istep, tt, ...). When TEST == 1 the function prints a timing line
+// and terminates the process with exit(0) at the first diagnostic step.
+void NS_solver_real()
+{
+
+    //  initial of Amu, Amu_t ---
+    //  Amu=0.d0;
+    // ----------------initial---------------------------------------------------------
+
+    exchange_boundary_xyz_packed_dev(pd , pd_d);
+    exchange_boundary_xyz_packed_dev(pu , pu_d);
+    exchange_boundary_xyz_packed_dev(pv , pv_d);
+    exchange_boundary_xyz_packed_dev(pw , pw_d);
+    exchange_boundary_xyz_packed_dev(pT , pT_d);
+
+    OCFD_bc();
+    
+    get_Amu();
+
+    if (my_id == 0)
+        printf("init ok\n\n");
+
+    // wstart0: start of the whole run; wstart: start of the current reporting interval.
+    REAL wstart0 , wstart , wend;
+    wstart0 = MPI_Wtime();
+    wstart = wstart0;
+    // -----------------------------------------------------------------------
+    do
+    {
+        // Swap the "current" and "next" conservative-variable buffers, on the host
+        // pointers and on the device pointers alike.
+        {
+            REAL * tmp = pf;
+            pf = pfn;
+            pfn = tmp;
+
+            tmp = pf_d->ptr;
+            pf_d->ptr = pfn_d->ptr;
+            pfn_d->ptr = tmp;
+        }
+
+        // 3-step Runge-Kutta
+        for (int KRK = 1; KRK <= 3; KRK++)
+        {
+            
+            du_comput(KRK);
+            
+            OCFD_time_advance(KRK);
+            
+            get_duvwT();
+
+            OCFD_bc();
+
+            get_Amu();
+
+        }
+
+        Istep++;
+        tt += dt;
+        
+        // ---Filtering -------------------------------
+        filtering(pf, pf_lap, pP);
+
+
+        //modify_NT();
+        // Run every registered analysis whose step interval divides the current step.
+        for(int i = 0; i < N_ana; i++){
+            if(Istep % Kstep_ana[i] == 0) OCFD_ana(K_ana[i], i);
+        } 
+
+
+        if(Istep % Kstep_show == 0){
+            MPI_Barrier(MPI_COMM_WORLD);
+            wend = MPI_Wtime();
+
+            if(TEST == 1){
+                // Benchmark mode: print host name, IP and the interval time, then quit.
+                // Both lookups can fail (gethostname returns non-zero, gethostbyname
+                // returns NULL); fall back to "unknown" instead of dereferencing NULL.
+                char hostbuffer[100];
+                const char *IPbuffer = "unknown";
+                struct hostent *host_entry = NULL;
+
+                if(gethostname(hostbuffer, sizeof(hostbuffer)) == 0){
+                    // A truncated name is not guaranteed to be NUL-terminated.
+                    hostbuffer[sizeof(hostbuffer) - 1] = '\0';
+                    host_entry = gethostbyname(hostbuffer);
+                }else{
+                    snprintf(hostbuffer, sizeof(hostbuffer), "%s", "unknown");
+                }
+
+                if(host_entry != NULL && host_entry->h_addrtype == AF_INET && host_entry->h_addr_list[0] != NULL){
+                    IPbuffer = inet_ntoa(*((struct in_addr*)
+                    host_entry->h_addr_list[0]));
+                }
+
+                printf("Host name: %s; Host IP: %s; GPU time %lf\n" , hostbuffer, IPbuffer, wend - wstart);
+
+                exit(0);
+            }
+
+            // Residual of the field stored at component offset 4 of pf_d
+            // (printed below as "Averaged Total Energy").
+            REAL E0 = 0.;
+            cudaField E0_d;
+
+            E0_d.pitch = pf_d->pitch; E0_d.ptr = pf_d->ptr + 4 * pf_d->pitch*ny*nz;
+            ana_residual(E0_d, &E0);
+
+            if(isnan(E0)){
+                if(IFLAG_HybridAuto == 1) {
+                    //HybridAuto_scheme_IO();
+                    //MPI_Barrier(MPI_COMM_WORLD);
+                }
+                ana_NAN_and_NT();
+            }
+
+            // pdu_d is borrowed as scratch storage for the inner part of T.
+            REAL T0 = 0.;
+            cudaField T0_d;
+
+            T0_d.pitch = pdu_d->pitch; T0_d.ptr = pdu_d->ptr;
+            get_inner(T0_d, *pT_d);
+            ana_residual(T0_d, &T0);
+
+            if(my_id == 0){
+                printf("%lf of %lf ( \033[33m%d\033[0m of %d ) , using \033[36m%lf\033[0m\n", tt , end_time , Istep , end_step  , wend - wstart0);
+                printf("%d steps GPU time %lf\n" ,Kstep_show , wend - wstart);
+                printf("Averaged Total Energy is %lf\n", E0);
+                printf("Averaged Total T is %lf\n", T0);
+                printf("\n");
+            }
+            wstart = MPI_Wtime();
+        }
+
+        
+        // -----------save data---------------------------------------------
+        if(Istep%Kstep_save == 0){
+            // Copy the primitive fields back to the host before writing them out.
+            memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+            memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+            memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+            memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+            memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+            OCFD_save(0, Istep , pd , pu , pv , pw , pT);
+        }
+        if(end_time <= 0.0) break; //end_time .le. 0  means that stop computation just after saving files
+
+    } while (tt < end_time);
+    // --------------------------------------------------------------------------------------
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (my_id == 0)
+    {
+        wend = MPI_Wtime();
+        printf("OK! opencfd is finished\n");
+        printf("Total GPU time %lf\n" , wend - wstart0);
+    }
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/OCFD_Schemes.cu
+++ b/src/OCFD_Schemes.cu
+#include <math.h>
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_Schemes.h"
+#include "OCFD_bound_Scheme.h"
+
+#include "parameters_d.h"
+#include "OCFD_warp_shuffle.h"
+#include "cuda_utility.h"
+
+#ifdef __cplusplus 
+extern "C"{
+#endif
+
+// Load the 1-D stencil of pf around the calling thread's grid point.
+//
+//   flagxyz  direction flag: 1/4 -> stencil along x, 2/5 -> along y, 3/6 -> along z
+//   coords   out: the thread's coordinates relative to job.start (written for every
+//            valid flag, even when the thread turns out to be outside the job)
+//   pf       source field
+//   stencil  out: stencil[i-ka1] receives the value at offset i, for i = ka1..kb1
+//   job      index range handled by this launch
+//
+// Returns 1, 2 or 3 (the axis) when the thread lies inside the job and the stencil was
+// filled, 0 otherwise (thread out of range or unknown flag).
+__device__ int get_data0_kernel(int flagxyz, dim3 *coords, cudaField pf, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
+	int offset = job.start.x + pf.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+	// Fold the six flags onto an axis id; unknown flags leave coords untouched.
+	int axis;
+	if(flagxyz == 1 || flagxyz == 4)      axis = 1;
+	else if(flagxyz == 2 || flagxyz == 5) axis = 2;
+	else if(flagxyz == 3 || flagxyz == 6) axis = 3;
+	else return 0;
+
+	unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
+	unsigned int y, z;
+	if(axis == 3){
+		// For the z direction the y and z thread dimensions swap roles.
+		y = coords->y = blockDim.z * blockIdx.z + threadIdx.z;
+		z = coords->z = blockDim.y * blockIdx.y + threadIdx.y;
+	}else{
+		y = coords->y = blockDim.y * blockIdx.y + threadIdx.y;
+		z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
+	}
+
+	// Threads outside the job extent contribute nothing.
+	if(!(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z))) return 0;
+
+	if(axis == 1){
+		for(int i = ka1; i <= kb1; i++){
+			stencil[i-ka1] = get_Field_LAP(pf, x+i, y, z, offset);
+		}
+	}else if(axis == 2){
+		for(int i = ka1; i <= kb1; i++){
+			stencil[i-ka1] = get_Field_LAP(pf, x, y+i, z, offset);
+		}
+	}else{
+		for(int i = ka1; i <= kb1; i++){
+			stencil[i-ka1] = get_Field_LAP(pf, x, y, z+i, offset);
+		}
+	}
+
+	return axis;
+}
+
+
+// Store a difference value, divided by the grid spacing of the chosen direction, into pfy.
+//
+//   flagxyz  only flagxyz.x is used: 1/4 -> divide by hx_d, 2/5 -> hy_d, 3/6 -> hz_d;
+//            any other value makes the function a no-op
+//   coords   thread coordinates relative to job.start
+//   tmp      undivided difference value
+//   pfy      destination field, addressed with every index shifted by -LAP
+__device__ void put_d0_kernel(dim3 flagxyz, dim3 coords, REAL tmp, cudaField pfy, cudaJobPackage job){
+	// Select the spacing first; bail out for flags that name no direction.
+	REAL h;
+	switch(flagxyz.x){
+		case 1: case 4: h = hx_d; break;
+		case 2: case 5: h = hy_d; break;
+		case 3: case 6: h = hz_d; break;
+		default: return;
+	}
+
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	get_Field(pfy, x-LAP, y-LAP, z-LAP) = tmp/h;
+}
+
+// Weighted antisymmetric combination of a 7-point stencil with weights
+// (-1, 9, -45, 0, 45, -9, 1)/60 on stencil[0..6]; stencil[3] (the centre) is not read.
+// The result is not divided by the grid spacing here.
+__device__ REAL OCFD_kernel_CD6(REAL *stencil){
+	const REAL *s = stencil;
+
+	return (45.0*(s[4] - s[2])
+	        -9.0*(s[5] - s[1])
+	            +(s[6] - s[0]))/60.0;
+}
+
+
+// Kernel: 7-point (offsets -3..3) difference of pf along the direction given by
+// flagxyzb.x, written to pfy through put_d0_kernel.
+// OCFD_D0bound_scheme_kernel is consulted first; the CD6 formula is applied only
+// when it returns non-zero.
+__global__ void OCFD_CD6_kernel(dim3 flagxyzb, cudaField pf, cudaField pfy, cudaJobPackage job){
+	REAL stencil[7];
+	REAL tmp;
+	dim3 coords;
+	int ia1 = -3;
+	int ib1 = 3;
+
+	// Threads outside the job (or with an unknown direction flag) have nothing to do.
+	if(get_data0_kernel(flagxyzb.x, &coords, pf, stencil, ia1, ib1, job) == 0) return;
+
+	if(OCFD_D0bound_scheme_kernel(&tmp, flagxyzb, coords, stencil, ia1, job) != 0){
+		tmp = OCFD_kernel_CD6(stencil);
+	}
+
+	put_d0_kernel(flagxyzb, coords, tmp, pfy, job);
+}
+
+
+// Weighted antisymmetric combination of a 9-point stencil with weights
+// (3, -32, 168, -672, 0, 672, -168, 32, -3)/840 on stencil[0..8]; stencil[4] (the centre)
+// is not read. The result is not divided by the grid spacing here.
+__device__ REAL OCFD_kernel_CD8(REAL *stencil){
+	const REAL *s = stencil;
+
+	return (672.0*(s[5] - s[3])
+	       -168.0*(s[6] - s[2])
+	        +32.0*(s[7] - s[1])
+	         -3.0*(s[8] - s[0]))/840.0;
+}
+
+// Kernel: 9-point (offsets -4..4) difference of pf along the direction given by
+// flagxyzb.x, written to pfy through put_d0_kernel.
+// OCFD_D0bound_scheme_kernel is consulted first; the CD8 formula is applied only
+// when it returns non-zero.
+__global__ void OCFD_CD8_kernel(dim3 flagxyzb, cudaField pf, cudaField pfy, cudaJobPackage job){
+	REAL stencil[9];
+	REAL tmp;
+	dim3 coords;
+	int ia1 = -4;
+	int ib1 = 4;
+
+	// Threads outside the job (or with an unknown direction flag) have nothing to do.
+	if(get_data0_kernel(flagxyzb.x, &coords, pf, stencil, ia1, ib1, job) == 0) return;
+
+	if(OCFD_D0bound_scheme_kernel(&tmp, flagxyzb, coords, stencil, ia1, job) != 0){
+		tmp = OCFD_kernel_CD8(stencil);
+	}
+
+	put_d0_kernel(flagxyzb, coords, tmp, pfy, job);
+}
+
+
+// Load the 1-D stencil of component `num` of the SoA field f for the calling thread.
+//
+//   flagxyz  direction flag: 1/4 -> x, 2/5 -> y, 3/6 -> z
+//   coords   out: thread coordinates relative to job.start
+//   stencil  out: stencil[i-ka1] holds the value at offset i, for i = ka1..kb1
+//   sort     scratch buffer used by the y and z cases; the kernels in this file pass
+//            their `extern __shared__ REAL sort[]` array here
+//   job      index range handled by this launch
+//
+// Returns 1, 2 or 3 (the axis) when the thread's final coordinates lie inside the job,
+// 0 otherwise. Note that cases 5 and 6 return the same codes as cases 2 and 3.
+//
+// Along the sweep direction the block origin advances by (blockDim-1), so neighbouring
+// blocks overlap by one point.
+// NOTE(review): the constants 128, 16 and 8 in the sort indexing presumably assume an
+// 8 x 8 (x,y) thread tile with 16 staged values per line - confirm against the launch
+// configuration. The literal 3 in cases 2/3 matches LAP-1 only if LAP == 4 - confirm.
+__device__ int get_data_kernel(int flagxyz, dim3 *coords, cudaSoA f, int num, REAL *stencil, int ka1, int kb1, REAL *sort, cudaJobPackage job){
+	
+	int offset = job.start.x + f.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+	switch(flagxyz){
+		// x direction: every thread reads its stencil directly from f.
+		case 1:
+		case 4:
+		{
+			unsigned int x = coords->x = (blockDim.x-1) * blockIdx.x + threadIdx.x;
+			unsigned int y = coords->y = blockDim.y * blockIdx.y + threadIdx.y;
+			unsigned int z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)){
+				for(int i = ka1; i <= kb1; i++){
+					stencil[i-ka1] = get_SoA_LAP(f, x+i, y, z, num, offset);
+				}
+				return 1;
+			}
+		}
+		break;
+
+		// y direction, flag 2: stage lines of f in `sort`, then read the stencil back
+		// with the roles of threadIdx.x and threadIdx.y exchanged (ID1 vs ID2).
+		case 2:
+		{   
+            unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
+            unsigned int y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.y;
+            unsigned int z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
+
+            // ID1: slot written by this thread; ID2: base slot read by this thread.
+            unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
+            unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
+
+            // First load is done by every thread, without a range check.
+            sort[ID1] = get_SoA_LAP(f, x, y-LAP+1, z, num, offset);
+
+			// Second load (8 slots further) only for threads inside the reduced range.
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y-1) && z < (job.end.z-job.start.z))
+            sort[ID1+8] = get_SoA_LAP(f, x, y+LAP+1, z, num, offset);
+
+            // All staged values must be visible before any thread reads them.
+            __syncthreads();
+
+            for(int i = ka1; i <= kb1; i++){
+                stencil[i-ka1] = sort[ID2+i+3];
+            }
+			
+            // After the exchange this thread owns the point indexed by (threadIdx.y, threadIdx.x).
+            x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
+            y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.x;
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 2;            
+		}
+		break;
+
+		// y direction, flag 5: same staging as flag 2, but the loads start at y-LAP / y+LAP
+		// and the read-back base is ID2+LAP instead of ID2+3.
+		case 5:
+		{   
+            unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
+            unsigned int y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.y;
+            unsigned int z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
+
+            unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
+            unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
+
+            // Unconditional first load.
+            sort[ID1] = get_SoA_LAP(f, x, y-LAP, z, num, offset);
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y-1) && z < (job.end.z-job.start.z))
+            sort[ID1+8] = get_SoA_LAP(f, x, y+LAP, z, num, offset);
+
+            __syncthreads();
+
+            for(int i = ka1; i <= kb1; i++){
+                stencil[i-ka1] = sort[ID2+i+LAP];
+            }
+			
+            x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
+            y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.x;
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 2;            
+		}
+		break;
+
+		// z direction, flag 3: as flag 2, with z swept by the y thread dimension and
+		// y taken from the z thread dimension.
+		case 3:
+		{
+            unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
+            unsigned int y = coords->y = blockDim.z * blockIdx.z + threadIdx.z;
+            unsigned int z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.y;
+
+            unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
+            unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
+
+            // Unconditional first load.
+            sort[ID1] = get_SoA_LAP(f, x, y, z-LAP+1, num, offset);
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z-1))
+            sort[ID1+8] = get_SoA_LAP(f, x, y, z+LAP+1, num, offset);
+
+            __syncthreads();
+
+            for(int i = ka1; i <= kb1; i++){
+                stencil[i-ka1] = sort[ID2+i+3];
+            }
+			
+            x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
+            z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.x;
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 3;     
+		}
+		break;
+
+		// z direction, flag 6: as flag 3, but loads start at z-LAP / z+LAP and the
+		// read-back base is ID2+LAP.
+		case 6:
+		{
+            unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
+            unsigned int y = coords->y = blockDim.z * blockIdx.z + threadIdx.z;
+            unsigned int z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.y;
+
+            unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
+            unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
+
+            // Unconditional first load.
+            sort[ID1] = get_SoA_LAP(f, x, y, z-LAP, num, offset);
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z-1))
+            sort[ID1+8] = get_SoA_LAP(f, x, y, z+LAP, num, offset);
+
+            __syncthreads();
+
+            for(int i = ka1; i <= kb1; i++){
+                stencil[i-ka1] = sort[ID2+i+LAP];
+            }
+			
+            x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
+            z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.x;
+
+			if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 3;            
+		}
+		break;
+	}
+
+	// Unknown flag, or thread outside the job.
+	return 0;
+}
+
+
+// Accumulate the contribution of a flux difference into component `num` of du
+// at the thread's own grid point.
+//
+//   flagxyz  .x selects the direction (1/4 -> x, 2/5 -> y, 3/6 -> z; other values: no-op);
+//            when .x is 1, 2 or 3, .z == 1 and the coordinate along that direction
+//            equals 1, a zero is added instead of the flux difference
+//   coords   thread coordinates relative to job.start
+//   tmp_r, tmp_l  the two flux values whose difference is taken
+//   Ajac     field read at the same point and used as a multiplier
+//
+// The added value is -Ajac*(tmp_r - tmp_l)/h with h = hx_d, hy_d or hz_d.
+__device__ void put_du_p_kernel(dim3 flagxyz, dim3 coords, REAL tmp_r, REAL tmp_l, cudaSoA du, int num, cudaField Ajac, cudaJobPackage job){
+	// Direction-dependent pieces: grid spacing and the coordinate along the sweep.
+	REAL h;
+	unsigned int along;
+	switch(flagxyz.x){
+		case 1: case 4: h = hx_d; along = coords.x; break;
+		case 2: case 5: h = hy_d; along = coords.y; break;
+		case 3: case 6: h = hz_d; along = coords.z; break;
+		default: return;
+	}
+
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	// Only the flags 1, 2 and 3 can trigger the "add zero" branch.
+	if(flagxyz.x <= 3 && flagxyz.z == 1 && along == 1){
+		atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
+	}else{
+		atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z)*(tmp_r - tmp_l)/h);
+	}
+}
+
+
+// Accumulate the contribution of a flux difference into component `num` of du
+// at the grid point one step behind the thread along the sweep direction.
+//
+//   flagxyz  .x selects the direction (1/4 -> x, 2/5 -> y, 3/6 -> z; other values: no-op);
+//            when .x is 4, 5 or 6, .z == 1 and the coordinate along that direction is
+//            the last one of the job, a zero is added instead of the flux difference
+//   coords   thread coordinates relative to job.start
+//   tmp_r, tmp_l  the two flux values whose difference is taken
+//   Ajac     field read at the shifted point and used as a multiplier
+//
+// The added value is -Ajac*(tmp_r - tmp_l)/h with h = hx_d, hy_d or hz_d.
+__device__ void put_du_m_kernel(dim3 flagxyz, dim3 coords, REAL tmp_r, REAL tmp_l, cudaSoA du, int num, cudaField Ajac, cudaJobPackage job){
+	unsigned int x = coords.x + job.start.x;
+	unsigned int y = coords.y + job.start.y;
+	unsigned int z = coords.z + job.start.z;
+
+	const unsigned int dir = flagxyz.x;
+
+	if(dir == 1 || dir == 4){
+		// x sweep: target point is (x-1, y, z).
+		if(dir == 4 && flagxyz.z == 1 && coords.x == job.end.x-job.start.x-1){
+			atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
+		}else{
+			atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x-1, y, z)*(tmp_r - tmp_l)/hx_d);
+		}
+	}else if(dir == 2 || dir == 5){
+		// y sweep: target point is (x, y-1, z).
+		if(dir == 5 && flagxyz.z == 1 && coords.y == job.end.y-job.start.y-1){
+			atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), 0);
+		}else{
+			atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y-1, z)*(tmp_r - tmp_l)/hy_d);
+		}
+	}else if(dir == 3 || dir == 6){
+		// z sweep: target point is (x, y, z-1).
+		if(dir == 6 && flagxyz.z == 1 && coords.z == job.end.z-job.start.z-1){
+			atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), 0);
+		}else{
+			atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z-1)*(tmp_r - tmp_l)/hz_d);
+		}
+	}
+}
+
+// =================================================================================================================================== //
+
+
+
+//---------------------------------------------------------WENO7_SYMBO_P-------------------------------------------------------------//
+//   7th-order WENO-SYMBO scheme (Bandwidth-Optimized Symmetric WENO scheme); see Martin et al., J. Comput. Phys. 220, 270-289.
+// -----The only difference between WENO-SYMOO and WENO-SYMBO is the set of coefficients; everything else is the same.
+
+// Weighted combination of five 4-point candidate reconstructions over an 8-point stencil
+// (stencil[0..7]).
+//
+//   WENO_LMT_FLAG  1 enables the total-variation test that may select fixed weights
+//   stencil        the 8 input values
+//
+// Returns sum_k(w_k * q_k) / (12 * sum_k w_k), where q_k are the candidate combinations
+// listed at the end of the function and w_k are either the fixed constants or the
+// smoothness-dependent weights computed below.
+__device__ REAL OCFD_weno7_SYMBO_kernel_P(int WENO_LMT_FLAG, REAL *stencil){
+
+	REAL S0, S1, S2, S3, S4;
+	// With the defaults TVmin = 0 and TVmax = 1 the test below fails for any finite
+	// WENO_TV_Limiter_d, so the nonlinear weights are used when the flag is off.
+	REAL tmp, tmp1, TVmin = 0, TVmax = 1;
+			
+	if(WENO_LMT_FLAG == 1){
+		// Total variation (sum of three absolute first differences) of each sub-stencil.
+		S0 = fabs(stencil[1] - stencil[0]) + fabs(stencil[2] - stencil[1]) + fabs(stencil[3] - stencil[2]);
+		S1 = fabs(stencil[2] - stencil[1]) + fabs(stencil[3] - stencil[2]) + fabs(stencil[4] - stencil[3]);
+		S2 = fabs(stencil[3] - stencil[2]) + fabs(stencil[4] - stencil[3]) + fabs(stencil[5] - stencil[4]);
+		S3 = fabs(stencil[4] - stencil[3]) + fabs(stencil[5] - stencil[4]) + fabs(stencil[6] - stencil[5]);
+		S4 = fabs(stencil[5] - stencil[4]) + fabs(stencil[6] - stencil[5]) + fabs(stencil[7] - stencil[6]);
+
+		// Smallest of the five.
+		tmp   = fmin(S0,S1);
+		tmp1  = fmin(S2,S3);
+		tmp   = fmin(tmp,tmp1);
+		TVmin = fmin(tmp ,S4);
+
+		// Largest of the five.
+		tmp   = fmax(S0,S1);
+		tmp1  = fmax(S2,S3);
+		tmp   = fmax(tmp,tmp1);
+		TVmax = fmax(tmp ,S4);
+	}
+		
+	if(TVmax < WENO_TV_Limiter_d*TVmin && TVmax < WENO_TV_MAX_d){
+		// Variation is small and evenly spread: use the fixed weights directly.
+		S0 = 0.0401954833730;
+		S1 = 0.2493800006710;
+		S2 = 0.4802686256260;
+		S3 = 0.2009775476730;
+		S4 = 0.0291783426580;
+	}else{
+
+		// Smoothness indicators: S_k = 720*a_k^2 + 780*b_k^2 + 781*c_k^2.
+		S0 = 0.0; S1 = 0.0; S2 = 0.0; S3 = 0.0; S4 =0.0; 
+
+		// 1st  
+		tmp =  -2.0*stencil[0] +  9.0*stencil[1] - 18.0*stencil[2] + 11.0*stencil[3]; S0 += 720.0*tmp*tmp;
+		tmp =       stencil[1] -  6.0*stencil[2] +  3.0*stencil[3] +  2.0*stencil[4]; S1 += 720.0*tmp*tmp;
+		tmp =  -2.0*stencil[2] -  3.0*stencil[3] +  6.0*stencil[4] -      stencil[5]; S2 += 720.0*tmp*tmp;
+		tmp = -11.0*stencil[3] + 18.0*stencil[4] -  9.0*stencil[5] +  2.0*stencil[6]; S3 += 720.0*tmp*tmp;
+		tmp = -26.0*stencil[4] + 57.0*stencil[5] - 42.0*stencil[6] + 11.0*stencil[7]; S4 += 720.0*tmp*tmp;
+
+		// 2nd 
+		tmp = -6.0*stencil[0] + 24.0*stencil[1] - 30.0*stencil[2] + 12.0*stencil[3]; S0 += 780.0*tmp*tmp;
+		tmp =                    6.0*stencil[2] - 12.0*stencil[3] +  6.0*stencil[4]; S1 += 780.0*tmp*tmp;
+		tmp =                    6.0*stencil[3] - 12.0*stencil[4] +  6.0*stencil[5]; S2 += 780.0*tmp*tmp;
+		tmp = 12.0*stencil[3] - 30.0*stencil[4] + 24.0*stencil[5] -  6.0*stencil[6]; S3 += 780.0*tmp*tmp;
+		tmp = 18.0*stencil[4] - 48.0*stencil[5] + 42.0*stencil[6] - 12.0*stencil[7]; S4 += 780.0*tmp*tmp;
+
+		// 3rd 
+		tmp = -6.0*stencil[0] + 18.0*( stencil[1] - stencil[2] ) + 6.0*stencil[3]; S0 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[1] + 18.0*( stencil[2] - stencil[3] ) + 6.0*stencil[4]; S1 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[2] + 18.0*( stencil[3] - stencil[4] ) + 6.0*stencil[5]; S2 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[3] + 18.0*( stencil[4] - stencil[5] ) + 6.0*stencil[6]; S3 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[4] + 18.0*( stencil[5] - stencil[6] ) + 6.0*stencil[7]; S4 += 781.0*tmp*tmp;
+
+		// The last indicator is replaced by the maximum of all five.
+		{
+			tmp   = fmax(S0,S1);
+			tmp1  = fmax(S2,S3);
+			tmp   = fmax(tmp,tmp1);
+			S4 = fmax(tmp ,S4);
+		}
+		
+		// Unnormalised weights: C_k times the product of (eps + S_j) over j != k, with
+		// eps = 2.592e-6. After division by their sum this is proportional to
+		// C_k/(eps + S_k). All five are computed from the old S values before any
+		// S_k is overwritten.
+		{
+			REAL tmp2, tmp3;
+		    tmp =  (0.0401954833730)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
+		    tmp1 = (0.2493800006710)*(2.592e-6+S0)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
+		    tmp2 = (0.4802686256260)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S3)*(2.592e-6+S4);
+		    tmp3 = (0.2009775476730)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S4);
+		    S4 =   (0.0291783426580)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3);
+		    S3 = tmp3;
+		    S2 = tmp2;
+		    S1 = tmp1;
+			S0 = tmp;
+		}
+	}
+    
+	// Normalisation factor of the weights.
+	REAL am=S0+S1+S2+S3+S4;
+    
+	// Weighted sum of the five candidate combinations (each carries a factor 12).
+	tmp1 = 0.0;
+	tmp = -3.0*stencil[0] + 13.0*stencil[1] - 23.0*stencil[2] + 25.0*stencil[3]; tmp1 += S0*tmp;
+	tmp =      stencil[1] -  5.0*stencil[2] + 13.0*stencil[3] +  3.0*stencil[4]; tmp1 += S1*tmp;
+	tmp =     -stencil[2] +  7.0*stencil[3] +  7.0*stencil[4] -      stencil[5]; tmp1 += S2*tmp;
+	tmp =  3.0*stencil[3] + 13.0*stencil[4] -  5.0*stencil[5] +      stencil[6]; tmp1 += S3*tmp;
+	tmp = 25.0*stencil[4] - 23.0*stencil[5] + 13.0*stencil[6] -  3.0*stencil[7]; tmp1 += S4*tmp;
+
+	tmp1 /= (12.0*am);
+
+	return tmp1;
+}
+
+
+// Kernel: WENO-SYMBO "P" flux difference for component i of f, accumulated into du.
+//
+// Each thread gathers an 8-point stencil (offsets -3..4), evaluates its flux value
+// tmp_r (boundary scheme first, interior formula when the boundary routine returns
+// non-zero), obtains tmp_l from its neighbour through __shfl_up_double(.., 1, warpSize),
+// and - except for threadIdx.x == 0 - hands both to put_du_p_kernel.
+__global__ void OCFD_weno7_SYMBO_P_kernel(int i, int WENO_LMT_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+    extern __shared__ REAL sort[];
+	REAL stencil[8];
+	dim3 coords;
+	int ia1 = -3;
+	int ib1 = 4;
+
+	// Threads without a valid grid point stop here.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL tmp_r, tmp_l;
+
+	if(OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		tmp_r = OCFD_weno7_SYMBO_kernel_P(WENO_LMT_FLAG, stencil);
+	}
+
+	tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+
+	// Thread 0 in x only supplies its value to the neighbour; it stores nothing.
+	if(threadIdx.x == 0) return;
+
+	put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+}
+
+//=====================================================================================================================================================//
+
+
+//----------------------------------------------------------------WENO7_SYMBO_M------------------------------------------------------------------------//
+// Mirror image of OCFD_weno7_SYMBO_kernel_P: the same weighted combination of five
+// 4-point candidate reconstructions, with the 8-point stencil traversed in reverse
+// (stencil[7] plays the role of stencil[0], and so on).
+//
+//   WENO_LMT_FLAG  1 enables the total-variation test that may select fixed weights
+//   stencil        the 8 input values
+//
+// Returns sum_k(w_k * q_k) / (12 * sum_k w_k).
+//
+// Fix relative to the previous revision: the total-variation block computed neither
+// TVmin (it stayed 0) nor a true maximum (fmin and fmax were mixed), so the limiter
+// test could not succeed. Both extrema are now computed exactly as in the P variant.
+__device__ REAL OCFD_weno7_SYMBO_kernel_M(int WENO_LMT_FLAG, REAL *stencil){
+
+	REAL S0, S1, S2, S3, S4;
+	// With the defaults TVmin = 0 and TVmax = 1 the test below fails for any finite
+	// WENO_TV_Limiter_d, so the nonlinear weights are used when the flag is off.
+	REAL tmp, tmp1, TVmin = 0, TVmax = 1;
+
+	if(WENO_LMT_FLAG == 1){
+		// Total variation (sum of three absolute first differences) of each sub-stencil.
+		S0 = fabs(stencil[7] - stencil[6]) + fabs(stencil[6] - stencil[5]) + fabs(stencil[5] - stencil[4]);
+		S1 = fabs(stencil[6] - stencil[5]) + fabs(stencil[5] - stencil[4]) + fabs(stencil[4] - stencil[3]);
+		S2 = fabs(stencil[5] - stencil[4]) + fabs(stencil[4] - stencil[3]) + fabs(stencil[3] - stencil[2]);
+		S3 = fabs(stencil[4] - stencil[3]) + fabs(stencil[3] - stencil[2]) + fabs(stencil[2] - stencil[1]);
+		S4 = fabs(stencil[3] - stencil[2]) + fabs(stencil[2] - stencil[1]) + fabs(stencil[1] - stencil[0]);
+
+		// Smallest of the five.
+		tmp   = fmin(S0,S1);
+		tmp1  = fmin(S2,S3);
+		tmp   = fmin(tmp,tmp1);
+		TVmin = fmin(tmp ,S4);
+
+		// Largest of the five.
+		tmp   = fmax(S0,S1);
+		tmp1  = fmax(S2,S3);
+		tmp   = fmax(tmp,tmp1);
+		TVmax = fmax(tmp ,S4);
+	}
+
+	if(TVmax < WENO_TV_Limiter_d*TVmin && TVmax < WENO_TV_MAX_d){
+		// Variation is small and evenly spread: use the fixed weights directly.
+		S0 = 0.0401954833730;
+		S1 = 0.2493800006710;
+		S2 = 0.4802686256260;
+		S3 = 0.2009775476730;
+		S4 = 0.0291783426580;
+	}else{
+		// Smoothness indicators: S_k = 720*a_k^2 + 780*b_k^2 + 781*c_k^2.
+		S0 = 0.0; S1 = 0.0; S2 = 0.0; S3 = 0.0; S4 =0.0; 
+
+		// 1st  
+		tmp =  -2.0*stencil[7] +  9.0*stencil[6] - 18.0*stencil[5] + 11.0*stencil[4]; S0 += 720.0*tmp*tmp;
+		tmp =       stencil[6] -  6.0*stencil[5] +  3.0*stencil[4] +  2.0*stencil[3]; S1 += 720.0*tmp*tmp;
+		tmp =  -2.0*stencil[5] -  3.0*stencil[4] +  6.0*stencil[3] -      stencil[2]; S2 += 720.0*tmp*tmp;
+		tmp = -11.0*stencil[4] + 18.0*stencil[3] -  9.0*stencil[2] +  2.0*stencil[1]; S3 += 720.0*tmp*tmp;
+		tmp = -26.0*stencil[3] + 57.0*stencil[2] - 42.0*stencil[1] + 11.0*stencil[0]; S4 += 720.0*tmp*tmp;
+
+		// 2nd 
+		tmp = -6.0*stencil[7] + 24.0*stencil[6] - 30.0*stencil[5] + 12.0*stencil[4]; S0 += 780.0*tmp*tmp;
+		tmp =                    6.0*stencil[5] - 12.0*stencil[4] +  6.0*stencil[3]; S1 += 780.0*tmp*tmp;
+		tmp =                    6.0*stencil[4] - 12.0*stencil[3] +  6.0*stencil[2]; S2 += 780.0*tmp*tmp;
+		tmp = 12.0*stencil[4] - 30.0*stencil[3] + 24.0*stencil[2] -  6.0*stencil[1]; S3 += 780.0*tmp*tmp;
+		tmp = 18.0*stencil[3] - 48.0*stencil[2] + 42.0*stencil[1] - 12.0*stencil[0]; S4 += 780.0*tmp*tmp;
+
+		// 3rd 
+		tmp = -6.0*stencil[7] + 18.0*( stencil[6] - stencil[5] ) + 6.0*stencil[4]; S0 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[6] + 18.0*( stencil[5] - stencil[4] ) + 6.0*stencil[3]; S1 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[5] + 18.0*( stencil[4] - stencil[3] ) + 6.0*stencil[2]; S2 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[4] + 18.0*( stencil[3] - stencil[2] ) + 6.0*stencil[1]; S3 += 781.0*tmp*tmp; 
+		tmp = -6.0*stencil[3] + 18.0*( stencil[2] - stencil[1] ) + 6.0*stencil[0]; S4 += 781.0*tmp*tmp;
+
+		// The last indicator is replaced by the maximum of all five.
+		{
+			tmp  = fmax(S0,S1);
+			tmp1 = fmax(S2,S3);
+			tmp  = fmax(tmp,tmp1);
+			S4   = fmax(tmp ,S4);
+		}
+
+		// Unnormalised weights: C_k times the product of (eps + S_j) over j != k, with
+		// eps = 2.592e-6. All five are computed from the old S values before any
+		// S_k is overwritten.
+		{
+			REAL tmp2, tmp3;
+			tmp =  (0.0401954833730)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
+			tmp1 = (0.2493800006710)*(2.592e-6+S0)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
+			tmp2 = (0.4802686256260)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S3)*(2.592e-6+S4);
+			tmp3 = (0.2009775476730)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S4);
+			S4 =   (0.0291783426580)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3);
+			S3 = tmp3;
+			S2 = tmp2;
+			S1 = tmp1;
+			S0 = tmp;
+		}
+	}
+
+	// Normalisation factor of the weights.
+	REAL am=S0+S1+S2+S3+S4;
+
+	// Weighted sum of the five candidate combinations (each carries a factor 12).
+	tmp1 = 0.0;
+	tmp = -3.0*stencil[7] + 13.0*stencil[6] - 23.0*stencil[5] + 25.0*stencil[4]; tmp1 += S0*tmp;
+	tmp =      stencil[6] -  5.0*stencil[5] + 13.0*stencil[4] +  3.0*stencil[3]; tmp1 += S1*tmp;
+	tmp =    - stencil[5] +  7.0*stencil[4] +  7.0*stencil[3] -      stencil[2]; tmp1 += S2*tmp;
+	tmp =  3.0*stencil[4] + 13.0*stencil[3] -  5.0*stencil[2] +      stencil[1]; tmp1 += S3*tmp;
+	tmp = 25.0*stencil[3] - 23.0*stencil[2] + 13.0*stencil[1] -  3.0*stencil[0]; tmp1 += S4*tmp;
+
+	tmp1 /= (12.0*am);
+
+	return tmp1;
+}
+
+
+// Kernel: WENO-SYMBO "M" flux difference for component i of f, accumulated into du.
+//
+// Each thread gathers an 8-point stencil (offsets -4..3), evaluates its flux value
+// tmp_r (boundary scheme first, interior formula when the boundary routine returns
+// non-zero), obtains tmp_l from its neighbour through __shfl_up_double(.., 1, warpSize),
+// and - except for threadIdx.x == 0 - hands both to put_du_m_kernel.
+__global__ void OCFD_weno7_SYMBO_M_kernel(int i, int WENO_LMT_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	// Block-wide barrier executed before anything else in this kernel.
+	__syncthreads();
+
+	REAL stencil[8];
+	dim3 coords;
+	int ia1 = -4;
+	int ib1 = 3;
+
+	// Threads without a valid grid point stop here.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL tmp_r = 0.0, tmp_l = 0.0;
+
+	if(OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		tmp_r = OCFD_weno7_SYMBO_kernel_M(WENO_LMT_FLAG, stencil);
+	}
+
+	tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+
+	// Thread 0 in x only supplies its value to the neighbour; it stores nothing.
+	if(threadIdx.x == 0) return;
+
+	put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+}
+//=======================================================================================================================================================//
+
+//---------------------------------------------------------------------WENO7_P---------------------------------------------------------------------------//
+// Weighted combination of four 4-point candidate reconstructions over a 7-point stencil
+// (stencil[0..6]).
+//
+// For each sub-stencil k a smoothness indicator
+//   S_k = 960*a^2 + 1040*b^2 + 1043*c^2 + 80*a*c
+// is built from three linear combinations a, b, c of the sub-stencil values. The
+// unnormalised weights are C_k times the product of (eps + S_j)^2 over j != k, with
+// C = (1, 12, 18, 4) and eps = 3.456e-4. Returns sum_k(w_k*q_k)/(12*sum_k w_k).
+__device__ REAL OCFD_weno7_kernel_P(REAL *stencil){
+
+	REAL S0 =0.0, S1 =0.0, S2 =0.0, S3 =0.0;
+	REAL tmp, tmp1, tmp2;
+
+	// Sub-stencil 0: stencil[0..3]. `tmp` is reused for the cross term a*c.
+	tmp  = -2.0*stencil[0] +  9.0*stencil[1] - 18.0*stencil[2] + 11.0*stencil[3]; S0 += 960*tmp*tmp;
+	tmp1 = -6.0*stencil[0] + 24.0*stencil[1] - 30.0*stencil[2] + 12.0*stencil[3]; S0 += (1040.0)*tmp1*tmp1;
+	tmp2 = -6.0*stencil[0] + 18.0*( stencil[1] - stencil[2] ) + 6.0*stencil[3];   S0 += (1043.0)*tmp2*tmp2; 
+	tmp  = tmp * tmp2; S0 += 80.0*tmp;
+
+	// Sub-stencil 1: stencil[1..4].
+	tmp  =      stencil[1] - 6.0*stencil[2] +  3.0*stencil[3] + 2.0*stencil[4]; S1 += 960*tmp*tmp;
+	tmp1 =                   6.0*stencil[2] - 12.0*stencil[3] + 6.0*stencil[4]; S1 += (1040.0)*tmp1*tmp1;
+	tmp2 = -6.0*stencil[1] + 18.0*( stencil[2] - stencil[3] ) + 6.0*stencil[4]; S1 += (1043.0)*tmp2*tmp2; 
+	tmp  = tmp * tmp2; S1 += 80.0*tmp;
+		
+	// Sub-stencil 2: stencil[2..5].
+	tmp  = -2.0*stencil[2] - 3.0*stencil[3] +  6.0*stencil[4] -   stencil[5]; S2 += 960*tmp*tmp;
+	tmp1 =                 6.0*stencil[3] - 12.0*stencil[4] + 6.0*stencil[5]; S2 += (1040.0)*tmp1*tmp1;
+	tmp2 = -6.0*stencil[2] + 18.0*(stencil[3] - stencil[4]) + 6.0*stencil[5]; S2 += (1043.0)*tmp2*tmp2; 
+	tmp  = tmp * tmp2; S2 += 80.0*tmp;
+		
+	// Sub-stencil 3: stencil[3..6].
+	tmp  = -11.0*stencil[3] + 18.0*stencil[4] -  9.0*stencil[5] + 2.0*stencil[6]; S3 += 960*tmp*tmp;
+	tmp1 =  12.0*stencil[3] - 30.0*stencil[4] + 24.0*stencil[5] - 6.0*stencil[6]; S3 += (1040.0)*tmp1*tmp1;
+	tmp2 =  -6.0*stencil[3] + 18.0*( stencil[4] -  stencil[5] ) + 6.0*stencil[6];  S3 += (1043.0)*tmp2*tmp2; 
+	tmp  =  tmp * tmp2; S3 += 80.0*tmp;
+
+
+	// Unnormalised weights; all four use the old S values, so S3 is overwritten first
+	// and S0..S2 only after the temporaries have been formed.
+	tmp  =        ((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
+	tmp1 = (12.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
+	tmp2 = (18.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S3)*(3.456e-4+S3));
+	S3   =  (4.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2));
+	S2   = tmp2;
+	S1   = tmp1;
+	S0   = tmp;
+		
+
+	// Normalisation factor of the weights.
+	REAL am=S0+S1+S2+S3;
+
+		
+	// Weighted sum of the four candidate combinations (each carries a factor 12).
+	tmp1 = 0.0;
+	tmp = -3.0*stencil[0] + 13.0*stencil[1] - 23.0*stencil[2] + 25.0*stencil[3]; tmp1 += S0*tmp;
+	tmp =      stencil[1] -  5.0*stencil[2] + 13.0*stencil[3] +  3.0*stencil[4]; tmp1 += S1*tmp;
+	tmp =     -stencil[2] +  7.0*stencil[3] +  7.0*stencil[4] -      stencil[5]; tmp1 += S2*tmp;
+	tmp =  3.0*stencil[3] + 13.0*stencil[4] -  5.0*stencil[5] +      stencil[6]; tmp1 += S3*tmp;
+
+	tmp1 /= (12.0*am);
+
+	return tmp1;
+}
+
+
+// Kernel: WENO7 "P" flux difference for component i of f, accumulated into du.
+//
+// Each thread gathers a 7-point stencil (offsets -3..3), evaluates its flux value
+// tmp_r (boundary scheme first, interior formula when the boundary routine returns
+// non-zero), obtains tmp_l from its neighbour through __shfl_up_double(.., 1, warpSize),
+// and - except for threadIdx.x == 0 - hands both to put_du_p_kernel.
+__global__ void OCFD_weno7_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	REAL stencil[7];
+	dim3 coords;
+	int ia1 = -3;
+	int ib1 = 3;
+
+	// Threads without a valid grid point stop here.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL tmp_r, tmp_l;
+
+	if(OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		tmp_r = OCFD_weno7_kernel_P(stencil);
+	}
+
+	tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+
+	// Thread 0 in x only supplies its value to the neighbour; it stores nothing.
+	if(threadIdx.x == 0) return;
+
+	put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+}
+//==============================================================================================================================================================//
+
+
+//------------------------------------------------------------------------WENO7_M-------------------------------------------------------------------------------//
+// Mirror image of OCFD_weno7_kernel_P: the same four-candidate weighted combination,
+// with the 7-point stencil traversed in reverse (stencil[6] plays the role of
+// stencil[0], and so on).
+//
+// Smoothness indicators: S_k = 960*a^2 + 1040*b^2 + 1043*c^2 + 80*a*c.
+// Unnormalised weights: C_k times the product of (eps + S_j)^2 over j != k, with
+// C = (1, 12, 18, 4) and eps = 3.456e-4. Returns sum_k(w_k*q_k)/(12*sum_k w_k).
+__device__ REAL OCFD_weno7_kernel_M(REAL *stencil){
+
+	REAL S0 =0.0, S1 =0.0, S2 =0.0, S3 =0.0;
+	REAL tmp, tmp1, tmp2;
+
+
+	// Sub-stencil 0: stencil[6..3]. `tmp` is reused for the cross term a*c.
+	tmp  = -2.0*stencil[6] +  9.0*stencil[5] - 18.0*stencil[4] + 11.0*stencil[3]; S0 += 960*tmp*tmp;
+	tmp1 = -6.0*stencil[6] + 24.0*stencil[5] - 30.0*stencil[4] + 12.0*stencil[3]; S0 += (1040.0)*tmp1*tmp1;
+	tmp2 = -6.0*stencil[6] + 18.0*( stencil[5] -  stencil[4] ) + 6.0*stencil[3];  S0 += (1043.0)*tmp2*tmp2; 
+	tmp  = tmp * tmp2; S0 += (80.0) * tmp;
+		
+	// Sub-stencil 1: stencil[5..2].
+	tmp  =      stencil[5] -  6.0* stencil[4] + 3.0*stencil[3] + 2.0*stencil[2]; S1 += 960*tmp*tmp;
+	tmp1 =                  6.0*stencil[4] - 12.0* stencil[3] +  6.0*stencil[2]; S1 += (1040.0)*tmp1*tmp1;
+	tmp2 = -6.0*stencil[5] + 18.0*( stencil[4] -  stencil[3]) + 6.0*stencil[2];  S1 += (1043.0)*tmp2*tmp2; 
+	tmp  = tmp *tmp2; S1 += (80.0) * tmp;
+		
+	// Sub-stencil 2: stencil[4..1].
+	tmp  =    -2.0*stencil[4] - 3.0*stencil[3] + 6.0*stencil[2] - stencil[1]; S2 += 960*tmp*tmp;
+	tmp1 =                 6.0*stencil[3] - 12.0*stencil[2] + 6.0*stencil[1]; S2 += (1040.0)*tmp1*tmp1;
+	tmp2 = -6.0*stencil[4] + 18.0*(stencil[3] - stencil[2]) + 6.0*stencil[1]; S2 += (1043.0)*tmp2*tmp2; 
+	tmp  = tmp *tmp2; S2 += (80.0) * tmp;
+		
+	// Sub-stencil 3: stencil[3..0].
+	tmp  = -11.0*stencil[3] + 18.0*stencil[2] -  9.0*stencil[1] + 2.0*stencil[0]; S3 += 960*tmp*tmp;
+	tmp1 =  12.0*stencil[3] - 30.0*stencil[2] + 24.0*stencil[1] - 6.0*stencil[0]; S3 += (1040.0)*tmp1*tmp1;
+	tmp2 =  -6.0*stencil[3] + 18.0*( stencil[2] -  stencil[1] ) + 6.0*stencil[0]; S3 += (1043.0)*tmp2*tmp2; 
+	tmp  =  tmp *tmp2; S3 += (80.0) * tmp;
+
+
+
+	// Unnormalised weights; all four use the old S values, so S3 is overwritten first
+	// and S0..S2 only after the temporaries have been formed.
+	tmp  =        ((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
+	tmp1 = (12.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
+	tmp2 = (18.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S3)*(3.456e-4+S3));
+	S3   =  (4.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2));
+	S2   = tmp2;
+	S1   = tmp1;
+	S0   = tmp;
+		
+
+	// Normalisation factor of the weights.
+	REAL am=S0+S1+S2+S3;
+
+
+	// Weighted sum of the four candidate combinations (each carries a factor 12).
+	tmp1 = 0.0;
+	tmp = -3.0*stencil[6] + 13.0*stencil[5] - 23.0*stencil[4] + 25.0*stencil[3]; tmp1 += S0*tmp;
+	tmp =      stencil[5] -  5.0*stencil[4] + 13.0*stencil[3] +  3.0*stencil[2]; tmp1 += S1*tmp;
+	tmp =     -stencil[4] +  7.0*stencil[3] +  7.0*stencil[2] -      stencil[1]; tmp1 += S2*tmp;
+	tmp =  3.0*stencil[3] + 13.0*stencil[2] -  5.0*stencil[1] +      stencil[0]; tmp1 += S3*tmp;
+
+	tmp1 /= (12.0*am);
+
+	return tmp1;
+}
+
+
+/*
+ * Kernel wrapper for the WENO7 "minus" reconstruction of component i.
+ *
+ * Each thread gathers a 7-value stencil (offsets -3..3) through
+ * get_data_kernel, obtains its value either from the boundary scheme or from
+ * OCFD_weno7_kernel_M, exchanges it with the neighbouring thread through
+ * __shfl_up_double, and hands both values to put_du_m_kernel.
+ *
+ * The boundary treatment uses OCFD_bound_scheme_kernel_m, matching every
+ * other *_M_kernel wrapper in this file (the "_p" variant belongs to the
+ * plus-flux wrappers).
+ *
+ * i        : component index forwarded to get_data_kernel / put_du_m_kernel.
+ * flagxyzb : direction / boundary flags (flagxyzb.x selects the direction).
+ * f, du    : input and output structure-of-arrays fields.
+ * Ajac     : field forwarded unchanged to put_du_m_kernel.
+ * job      : index range handled by this launch.
+ */
+__global__ void OCFD_weno7_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	dim3 coords;
+	REAL stencil[7];
+
+	int ia1 = -3; int ib1 = 3;
+
+	// A zero return means this thread has no work for this launch.
+	int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
+
+	if(flag != 0){
+		REAL tmp_r, tmp_l;
+
+		// Boundary scheme for the minus flux; a non-zero return means the
+		// interior scheme must still be applied.
+		flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);
+
+		if(flag != 0) tmp_r = OCFD_weno7_kernel_M(&stencil[0]);
+
+		// Value computed by the thread one position earlier.
+		tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+
+		// Thread 0 of the block has no valid predecessor value, so it only supplies data.
+		if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+	}
+}
+//==============================================================================================================================================================//
+
+/*
+ * Sign transfer: returns |x1| when x2 >= 0 and -|x1| otherwise.
+ */
+__device__ REAL sign(REAL x1, REAL x2){
+	const REAL magnitude = fabs(x1);
+	return (x2 >=0) ? magnitude : -magnitude;
+}
+
+/*
+ * Two-argument minmod: zero when x1 and x2 have opposite signs, otherwise
+ * the smaller magnitude carrying their common sign.
+ */
+__device__ REAL minmod2(REAL x1, REAL x2){
+	// 0.5*(sgn(x1)+sgn(x2)) is +1, -1 or 0 depending on sign agreement.
+	return 0.5*(sign(1.0, x1) + sign(1.0, x2))*fmin(fabs(x1),fabs(x2));
+}
+
+/*
+ * Four-argument minmod: returns zero unless all four arguments share the
+ * same sign, in which case it returns the smallest magnitude carrying that
+ * sign.
+ *
+ * The magnitude is taken as min(|x1|,|x2|,|x3|,|x4|).  Taking the minimum of
+ * the raw values instead would, for all-negative inputs, pick the most
+ * negative argument and combine it with a sign factor of -1, returning a
+ * positive number of the largest magnitude.
+ */
+__device__ REAL minmod4(REAL x1, REAL x2, REAL x3, REAL x4){
+
+	// Sign factor: +1 / -1 when every argument agrees in sign with x1, else 0.
+	REAL sign_factor = 0.5*(sign(1.0, x1) + sign(1.0, x2));
+
+	sign_factor = sign_factor*fabs(0.5*(sign(1.0, x1) + sign(1.0, x3)));
+	sign_factor = sign_factor*fabs(0.5*(sign(1.0, x1) + sign(1.0, x4)));
+
+	// Smallest magnitude among the four arguments.
+	REAL smallest = fmin(fabs(x1), fabs(x2));
+	smallest = fmin(smallest, fmin(fabs(x3), fabs(x4)));
+
+	return sign_factor*smallest;
+}
+
+//===================================================================2order_NND========================================================================//
+/*
+ * Limited two-difference reconstruction on three values:
+ * stencil[1] plus half the minmod of the forward and backward differences.
+ */
+__device__ REAL OCFD_NND2_kernel_P(REAL *stencil){
+	return stencil[1] + 0.5*minmod2(stencil[2] - stencil[1], stencil[1] - stencil[0]);
+}
+
+
+/*
+ * Kernel wrapper for the NND2 "plus" reconstruction of component i.
+ * Gathers a 5-value stencil (offsets -2..2), applies the boundary scheme or
+ * OCFD_NND2_kernel_P (on stencil[1..3]), then passes this thread's value and
+ * its predecessor's value to put_du_p_kernel.
+ */
+__global__ void OCFD_NND2_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -2;
+	int ib1 = 2;
+	dim3 coords;
+	REAL stencil[5];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_NND2_kernel_P(&stencil[1]);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------
+/*
+ * Mirror of OCFD_NND2_kernel_P: stencil[1] plus half the minmod of the
+ * differences taken in the opposite direction.
+ */
+__device__ REAL OCFD_NND2_kernel_M(REAL *stencil){
+	return stencil[1] + 0.5*minmod2(stencil[0] - stencil[1], stencil[1] - stencil[2]);
+}
+
+
+/*
+ * Kernel wrapper for the NND2 "minus" reconstruction of component i.
+ * Gathers a 5-value stencil (offsets -2..2), applies the minus boundary
+ * scheme or OCFD_NND2_kernel_M (on stencil[1..3]), then passes this thread's
+ * value and its predecessor's value to put_du_m_kernel.
+ */
+__global__ void OCFD_NND2_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -2;
+	int ib1 = 2;
+	dim3 coords;
+	REAL stencil[5];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_NND2_kernel_M(&stencil[1]);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+//==================================================================================================================================
+
+
+//===================================================================7th-order upwind (UP7)========================================================================//
+/*
+ * Fixed-coefficient linear combination of stencil[0..6], divided by 420.
+ * (The wrapper supplies 8 values; stencil[7] is not read here.)
+ */
+__device__ REAL OCFD_UP7_kernel_P(REAL *stencil){
+	REAL *v = stencil;
+
+	return (-3.0*v[0] + 25.0*v[1] - 101.0*v[2] + 319.0*v[3] + 214.0*v[4]
+			- 38.0*v[5] + 4.0*v[6])/420.0;
+}
+
+
+/*
+ * Kernel wrapper for the UP7 "plus" reconstruction of component i.
+ * Gathers an 8-value stencil (offsets -3..4), applies the boundary scheme or
+ * OCFD_UP7_kernel_P, then passes this thread's value and its predecessor's
+ * value to put_du_p_kernel.
+ */
+__global__ void OCFD_UP7_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -3;
+	int ib1 = 4;
+	dim3 coords;
+	REAL stencil[8];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_UP7_kernel_P(stencil);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+
+
+//----------------------------------------------------------------------------------------------------------------------------
+/*
+ * Mirror of OCFD_UP7_kernel_P: the same coefficients applied to
+ * stencil[7] down to stencil[1], divided by 420.  stencil[0] is not read.
+ */
+__device__ REAL OCFD_UP7_kernel_M(REAL *stencil){
+	REAL *v = stencil;
+
+	return (-3.0*v[7] + 25.0*v[6] - 101.0*v[5] + 319.0*v[4] + 214.0*v[3]
+		- 38.0*v[2] + 4.0*v[1])/420.0;
+}
+
+
+/*
+ * Kernel wrapper for the UP7 "minus" reconstruction of component i.
+ * Gathers an 8-value stencil (offsets -4..3), applies the minus boundary
+ * scheme or OCFD_UP7_kernel_M, then passes this thread's value and its
+ * predecessor's value to put_du_m_kernel.
+ */
+__global__ void OCFD_UP7_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -4;
+	int ib1 = 3;
+	dim3 coords;
+	REAL stencil[8];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_UP7_kernel_M(stencil);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+//==================================================================================================================================
+
+
+//===========================================================OMP6===================================================================
+/*
+ * OMP6 "plus" reconstruction on an 8-value stencil (stencil[0..7]).
+ *
+ * Step 1: OMP6_FLAG selects a pair of scheme parameters (m, n):
+ *           1 -> (0.001, 0), 2 -> (0, -1/140), anything else -> (0.015, 0).
+ * Step 2: an 8-point linear combination whose coefficients depend on
+ *         0.5*(m+n) and 0.5*(m-n) gives the unlimited value mid_nf.
+ * Step 3: if mid_nf fails the test
+ *           (mid_nf - stencil[3]) * (mid_nf - m_ref) < 1e-10,
+ *         with m_ref = stencil[3] + minmod2(forward diff, backward diff),
+ *         lower/upper bounds are built from neighbouring values and
+ *         minmod4-limited second differences, and mid_nf is pulled towards
+ *         them with minmod2.
+ *         NOTE(review): this looks like a monotonicity-preserving (MP)
+ *         limiter -- confirm against the reference scheme.
+ *
+ * returns : the (possibly limited) reconstructed value.
+ */
+__device__ REAL OCFD_OMP6_kernel_P(int OMP6_FLAG, REAL *stencil){
+
+	REAL m,n;
+
+	if(OMP6_FLAG == 1){
+		m = 0.001; n = 0.0;
+	}else if(OMP6_FLAG == 2){
+		m = 0.0; n = - 1.0/140.0;
+	}else{
+		m = 0.015; n = 0.0;
+	}
+
+	// From here on mid_nf temporarily holds 0.5*(m+n) and m holds 0.5*(m-n).
+	REAL mid_nf = 0.5*(m + n); m = 0.5*(m - n); 
+
+	// Unlimited linear reconstruction; mid_nf is overwritten with the result.
+	mid_nf =             ( 60.0*mid_nf  * stencil[7] + 
+	  ( 1.0 -   60.0*m -  360.0*mid_nf) * stencil[6] + 
+	  (-8.0 +  360.0*m +  900.0*mid_nf) * stencil[5] + 
+	  (37.0 -  900.0*m - 1200.0*mid_nf) * stencil[4] + 
+	  (37.0 + 1200.0*m +  900.0*mid_nf) * stencil[3] + 
+	  (-8.0 -  900.0*m -  360.0*mid_nf) * stencil[2] + 
+	  (1.0  +  360.0*m +   60.0*mid_nf) * stencil[1] - 
+							     60.0*m * stencil[0])/60.0;
+
+	// Reference value used by the acceptance test below.
+	m = stencil[3] + minmod2((stencil[4] - stencil[3]), (stencil[3] - stencil[2]));
+
+	// Limit only when mid_nf does not lie between stencil[3] and the reference value.
+	if((mid_nf - stencil[3])*(mid_nf - m) >= 1.e-10){
+		REAL tmp, tmp1;
+
+		// Second differences centred on stencil[3] (m) and stencil[4] (n).
+		m = stencil[2] + stencil[4] - 2.0*stencil[3];
+		n = stencil[3]  + stencil[5] - 2.0*stencil[4];
+		tmp  = 4.0*m - n;
+		tmp1 = 4.0*n - m;
+
+		// tmp: average of stencil[3], stencil[4] corrected by a limited curvature.
+		tmp = 0.5*(stencil[3] + stencil[4]) - 0.5*minmod4(tmp, tmp1, n, m);
+
+		// n is reused for the second difference centred on stencil[2];
+		// tmp1: extrapolation from stencil[2], stencil[3] plus 4/3 of a limited curvature.
+		n = stencil[1] + stencil[3] - 2.0*stencil[2];
+		tmp1 = stencil[3] + 0.5*(stencil[3] - stencil[2]) + 4.0*minmod4(4*n - m, 4*m - n, m, n)/3.0;
+
+		{
+			// Lower bound: m = max( min(s3, s4, tmp), min(s3, s3 + 4*(s3 - s2), tmp1) ).
+			m = fmin(stencil[3], stencil[4]);
+			m = fmin(m, tmp);
+
+			n = stencil[3] + 4.0*(stencil[3] - stencil[2]);
+
+			n = fmin(stencil[3], n);
+			n = fmin(n, tmp1);
+
+			m = fmax(m, n);
+		}
+		{
+			// Upper bound: n = min( max(s3, s4, tmp), max(s3, s3 + 4*(s3 - s2), tmp1) ).
+			tmp = fmax(stencil[3], tmp);
+			tmp = fmax(stencil[4], tmp);
+
+			n = stencil[3] + 4.0*(stencil[3] - stencil[2]);
+
+			n = fmax(stencil[3], n);
+			n = fmax(n, tmp1);
+
+			n = fmin(tmp, n);
+		}
+
+		// Move mid_nf by the minmod of its distances to the two bounds.
+		mid_nf = mid_nf + minmod2(n - mid_nf, m - mid_nf);
+	}
+
+	return mid_nf;
+}
+
+
+/*
+ * Kernel wrapper for the OMP6 "plus" reconstruction of component i.
+ * Gathers an 8-value stencil (offsets -3..4), applies the boundary scheme or
+ * OCFD_OMP6_kernel_P (parameterized by OMP6_FLAG), then passes this thread's
+ * value and its predecessor's value to put_du_p_kernel.
+ */
+__global__ void OCFD_OMP6_P_kernel(int i, int OMP6_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -3;
+	int ib1 = 4;
+	dim3 coords;
+	REAL stencil[8];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_OMP6_kernel_P(OMP6_FLAG, stencil);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+//---------------------------------------------------------OMP6_M---------------------------------------------------------------
+
+/*
+ * OMP6 "minus" reconstruction on an 8-value stencil (stencil[0..7]).
+ * This is the mirror image of OCFD_OMP6_kernel_P under the index map
+ * k -> 7 - k, so stencil[4] plays the role of the local cell, stencil[3] the
+ * neighbour across the interface and stencil[5] the neighbour behind it.
+ *
+ * Step 1: OMP6_FLAG selects the scheme parameters (m, n):
+ *           1 -> (0.001, 0), 2 -> (0, -1/140), anything else -> (0.015, 0).
+ * Step 2: an 8-point linear combination gives the unlimited value mid_nf.
+ * Step 3: if mid_nf does not lie between stencil[4] and the minmod reference
+ *         value, lower/upper bounds are built and mid_nf is limited.
+ *
+ * Two index slips that broke the mirror symmetry (and disagreed with the
+ * neighbouring lines of this same function) are corrected here:
+ *   - the extrapolated value tmp1 uses (stencil[4] - stencil[5]), the same
+ *     backward difference used for the 4x extrapolation below;
+ *   - the lower bound starts from min(stencil[4], stencil[3]), the same pair
+ *     used for the upper bound.
+ *
+ * returns : the (possibly limited) reconstructed value.
+ */
+__device__ REAL OCFD_OMP6_kernel_M(int OMP6_FLAG, REAL *stencil){
+
+	REAL m,n;
+
+	if(OMP6_FLAG == 1){
+		m = 0.001; n = 0.0;
+	}else if(OMP6_FLAG == 2){
+		m = 0.0; n = - 1.0/140.0;
+	}else{
+		m = 0.015; n = 0.0;
+	}
+
+	// From here on mid_nf temporarily holds 0.5*(m+n) and m holds 0.5*(m-n).
+	REAL mid_nf = 0.5*(m + n); m = 0.5*(m - n);
+
+	// Unlimited linear reconstruction; mid_nf is overwritten with the result.
+	mid_nf =            (  60.0*mid_nf  * stencil[0] + 
+	   (1.0 -   60.0*m -  360.0*mid_nf) * stencil[1] + 
+	  (-8.0 +  360.0*m +  900.0*mid_nf) * stencil[2] + 
+	  (37.0 -  900.0*m - 1200.0*mid_nf) * stencil[3] + 
+	  (37.0 + 1200.0*m +  900.0*mid_nf) * stencil[4]  + 
+	  (-8.0 -  900.0*m -  360.0*mid_nf) * stencil[5]  + 
+	   (1.0 +  360.0*m +   60.0*mid_nf) * stencil[6]  - 
+							     60.0*m * stencil[7])/60.0;
+
+	// Reference value used by the acceptance test below.
+	m = stencil[4] + minmod2((stencil[3] - stencil[4]), (stencil[4] - stencil[5]));
+
+	// Limit only when mid_nf does not lie between stencil[4] and the reference value.
+	if((mid_nf - stencil[4])*(mid_nf - m) >= 1.e-10){
+		REAL tmp, tmp1;
+
+		// Second differences centred on stencil[4] (m) and stencil[3] (n).
+		m = stencil[5] + stencil[3] - 2.0*stencil[4];
+		n = stencil[4] + stencil[2] - 2.0*stencil[3];
+		tmp  = 4.0*m - n;
+		tmp1 = 4.0*n - m;
+
+		// tmp: average of stencil[4], stencil[3] corrected by a limited curvature.
+		tmp = 0.5*(stencil[4] + stencil[3]) - 0.5*minmod4(tmp, tmp1, n, m);
+
+		// n is reused for the second difference centred on stencil[5];
+		// tmp1: extrapolation from stencil[5], stencil[4] plus 4/3 of a limited curvature.
+		n = stencil[6] + stencil[4] - 2.0*stencil[5];
+		tmp1 = stencil[4] + 0.5*(stencil[4] - stencil[5]) + 4.0*minmod4(4*n - m, 4*m - n, m, n)/3.0;
+
+		{
+			// Lower bound: m = max( min(s4, s3, tmp), min(s4, s4 + 4*(s4 - s5), tmp1) ).
+			m = fmin(stencil[4], stencil[3]);
+			m = fmin(m, tmp);
+
+			n = stencil[4] + 4.0*(stencil[4] - stencil[5]);
+
+			n = fmin(stencil[4], n);
+			n = fmin(n, tmp1);
+
+			m = fmax(m, n);
+		}
+		{
+			// Upper bound: n = min( max(s4, s3, tmp), max(s4, s4 + 4*(s4 - s5), tmp1) ).
+			tmp = fmax(stencil[4], tmp);
+			tmp = fmax(stencil[3], tmp);
+
+			n = stencil[4] + 4.0*(stencil[4] - stencil[5]);
+
+			n = fmax(stencil[4], n);
+			n = fmax(n, tmp1);
+
+			n = fmin(tmp, n);
+		}
+
+		// Move mid_nf by the minmod of its distances to the two bounds.
+		mid_nf = mid_nf + minmod2(n - mid_nf, m - mid_nf);
+	}
+
+	return mid_nf;
+}
+
+
+/*
+ * Kernel wrapper for the OMP6 "minus" reconstruction of component i.
+ * Gathers an 8-value stencil (offsets -4..3), applies the minus boundary
+ * scheme or OCFD_OMP6_kernel_M (parameterized by OMP6_FLAG), then passes this
+ * thread's value and its predecessor's value to put_du_m_kernel.
+ */
+__global__ void OCFD_OMP6_M_kernel(int i, int OMP6_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -4;
+	int ib1 = 3;
+	dim3 coords;
+	REAL stencil[8];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_OMP6_kernel_M(OMP6_FLAG, stencil);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+
+
+/*
+ * WENO5 "plus" reconstruction on a 5-value stencil (stencil[0..4]).
+ *
+ * Three 3-point sub-stencils are used: [2..4] (S0, q03), [1..3] (S1, q13)
+ * and [0..2] (S2, q23).  Each S_k is 13*(second difference)^2 plus
+ * 3*(one-sided or centred first difference)^2.
+ * NOTE(review): this appears to be 12x the usual 13/12, 1/4 form of the
+ * indicator, which would explain why ep enters as 12.0*ep -- confirm.
+ *
+ * The un-normalized weights a_k are written in common-denominator form,
+ * C_k * prod_{j != k} (12*ep + S_j)^2 with C = {3, 6, 1}.  The candidates
+ * q_k carry a common factor of 6 that is removed in the final division.
+ *
+ * returns : (a0*q03 + a1*q13 + a2*q23) / (6*(a0 + a1 + a2)).
+ */
+__device__ REAL OCFD_weno5_kernel_P(REAL *stencil){
+	//-2 ---- 1
+
+	REAL S0 = 0.0, S1 = 0.0, S2 = 0.0;
+	REAL tmp;
+	REAL ep = 1e-6;
+
+	// Sub-stencil [2..4].
+	tmp =     stencil[2] - 2.0*stencil[3] + stencil[4]; S0 += 13*tmp*tmp;
+	tmp = 3.0*stencil[2] - 4.0*stencil[3] + stencil[4]; S0 +=  3*tmp*tmp;
+
+	REAL q03 = (2.0*stencil[2] + 5.0*stencil[3] - stencil[4]);
+
+
+	// Sub-stencil [1..3].
+	tmp = stencil[1] - 2.0*stencil[2] + stencil[3]; S1 += 13*tmp*tmp;
+	tmp =                  stencil[1] - stencil[3]; S1 +=  3*tmp*tmp;
+
+	REAL q13 = (-stencil[1] + 5.0*stencil[2] + 2.0*stencil[3]);
+
+
+	// Sub-stencil [0..2].
+	tmp = stencil[0] - 2.0*stencil[1] +     stencil[2]; S2 += 13*tmp*tmp;
+	tmp = stencil[0] - 4.0*stencil[1] + 3.0*stencil[2]; S2 +=  3*tmp*tmp;
+
+	REAL q23 = (2.0*stencil[0] - 7.0*stencil[1] + 11.0*stencil[2]);
+
+
+	// Un-normalized weights; each omits its own (12*ep + S_k)^2 factor.
+	REAL a0 = 3.0*((12.0*ep + S1)*(12.0*ep + S1))*((12.0*ep + S2)*(12.0*ep + S2));
+	REAL a1 = 6.0*((12.0*ep + S0)*(12.0*ep + S0))*((12.0*ep + S2)*(12.0*ep + S2));
+	REAL a2 =     ((12.0*ep + S0)*(12.0*ep + S0))*((12.0*ep + S1)*(12.0*ep + S1));
+	
+	
+	// Normalize by the weight sum and remove the factor 6 carried by q_k.
+	tmp = (a0*q03 + a1*q13 + a2*q23)/(6.0*(a0 + a1 + a2));
+
+	return tmp;
+}
+
+
+/*
+ * Kernel wrapper for the WENO5 "plus" reconstruction of component i.
+ * Gathers a 5-value stencil (offsets -2..2), applies the boundary scheme or
+ * OCFD_weno5_kernel_P, then passes this thread's value and its predecessor's
+ * value to put_du_p_kernel.
+ */
+__global__ void OCFD_weno5_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -2;
+	int ib1 = 2;
+	dim3 coords;
+	REAL stencil[5];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_weno5_kernel_P(stencil);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+
+
+/*
+ * WENO5 "minus" reconstruction on a 5-value stencil (stencil[0..4]).
+ * This is OCFD_weno5_kernel_P with the stencil reflected about index 2
+ * (index k replaced by 4 - k).
+ *
+ * Sub-stencils: [0..2] (S0, q03), [1..3] (S1, q13), [2..4] (S2, q23).
+ * The un-normalized weights a_k are C_k * prod_{j != k} (12*ep + S_j)^2 with
+ * C = {3, 6, 1}.
+ *
+ * returns : (a0*q03 + a1*q13 + a2*q23) / (6*(a0 + a1 + a2)).
+ */
+__device__ REAL OCFD_weno5_kernel_M(REAL *stencil){
+	//-1  ----- 2
+
+	REAL S0 = 0.0, S1 = 0.0, S2 = 0.0;
+	REAL tmp;
+	REAL ep = 1e-6;
+
+	// Sub-stencil [0..2].
+	tmp =     stencil[2] - 2.0*stencil[1] + stencil[0]; S0 += 13*tmp*tmp;
+	tmp = 3.0*stencil[2] - 4.0*stencil[1] + stencil[0]; S0 +=  3*tmp*tmp;
+
+	REAL q03 = (2.0*stencil[2] + 5.0*stencil[1] - stencil[0]);
+
+
+	// Sub-stencil [1..3].
+	tmp = stencil[3] - 2.0*stencil[2] + stencil[1]; S1 += 13*tmp*tmp;
+	tmp =                  stencil[3] - stencil[1]; S1 +=  3*tmp*tmp;
+
+	REAL q13 = (-stencil[3] + 5.0*stencil[2] + 2.0*stencil[1]);
+
+
+	// Sub-stencil [2..4].
+	tmp = stencil[4] - 2.0*stencil[3] +     stencil[2]; S2 += 13*tmp*tmp;
+	tmp = stencil[4] - 4.0*stencil[3] + 3.0*stencil[2]; S2 +=  3*tmp*tmp;
+
+	REAL q23 = (2.0*stencil[4] - 7.0*stencil[3] + 11.0*stencil[2]);
+
+
+	// Un-normalized weights; each omits its own (12*ep + S_k)^2 factor.
+	REAL a0 = 3.0*((12.0*ep + S1)*(12.0*ep + S1))*((12.0*ep + S2)*(12.0*ep + S2));
+	REAL a1 = 6.0*((12.0*ep + S0)*(12.0*ep + S0))*((12.0*ep + S2)*(12.0*ep + S2));
+	REAL a2 =     ((12.0*ep + S0)*(12.0*ep + S0))*((12.0*ep + S1)*(12.0*ep + S1));	
+	
+	// Normalize by the weight sum and remove the factor 6 carried by q_k.
+	tmp = (a0*q03 + a1*q13 + a2*q23)/(6.0*(a0 + a1 + a2));
+
+	return tmp;
+}
+
+
+/*
+ * Kernel wrapper for the WENO5 "minus" reconstruction of component i.
+ * Gathers a 5-value stencil (offsets -2..2), applies the minus boundary
+ * scheme or OCFD_weno5_kernel_M, then passes this thread's value and its
+ * predecessor's value to put_du_m_kernel.
+ */
+__global__ void OCFD_weno5_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
+	extern __shared__ REAL sort[];
+	int ia1 = -2;
+	int ib1 = 2;
+	dim3 coords;
+	REAL stencil[5];
+
+	// A zero return means this thread has nothing to do.
+	if(get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job) == 0) return;
+
+	REAL flux_here;
+	// Non-zero return: the boundary scheme did not apply, use the interior scheme.
+	if(OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job) != 0){
+		flux_here = OCFD_weno5_kernel_M(stencil);
+	}
+
+	// Value computed by the thread one position earlier.
+	REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);
+
+	if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
+}
+
+//-------------------------------------------------------------- CD6----------------------------------------------------------//
+/*
+ * Sixth-order central difference in x:
+ *   pfx = (45*(f[+1]-f[-1]) - 9*(f[+2]-f[-2]) + (f[+3]-f[-3])) / (60*hx_d).
+ * Thread indices are offset by job.start, so x, y, z address cells of pf that
+ * include the LAP offset; the result is written to pfx at (index - LAP).
+ */
+__global__ void OCFD_dx0_CD6_kernel(cudaField pf , cudaField pfx , cudaJobPackage job){
+	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads past the end of the job range do nothing.
+	if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+	REAL back3 = get_Field_LAP(pf, x-3, y, z);
+	REAL back2 = get_Field_LAP(pf, x-2, y, z);
+	REAL back1 = get_Field_LAP(pf, x-1, y, z);
+	REAL fwd1  = get_Field_LAP(pf, x+1, y, z);
+	REAL fwd2  = get_Field_LAP(pf, x+2, y, z);
+	REAL fwd3  = get_Field_LAP(pf, x+3, y, z);
+
+	get_Field(pfx , x-LAP,y-LAP,z-LAP) = (
+			45.0*( fwd1 - back1 ) 
+			-9.0*( fwd2 - back2 )
+				+( fwd3 - back3 ) )
+		/(60.0*hx_d);
+}
+
+
+
+
+/*
+ * Sixth-order central difference in y:
+ *   pfy = (45*(f[+1]-f[-1]) - 9*(f[+2]-f[-2]) + (f[+3]-f[-3])) / (60*hy_d).
+ * The block's x dimension is mapped onto the y index and its y dimension onto
+ * the x index.  Indices include the LAP offset; the result is written to pfy
+ * at (index - LAP).
+ */
+__global__ void OCFD_dy0_CD6_kernel(cudaField pf , cudaField pfy , cudaJobPackage job){
+	unsigned int x = blockDim.y * blockIdx.y + threadIdx.y + job.start.x;
+	unsigned int y = blockDim.x * blockIdx.x + threadIdx.x + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads past the end of the job range do nothing.
+	if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+	REAL back3 = get_Field_LAP(pf, x, y-3, z);
+	REAL back2 = get_Field_LAP(pf, x, y-2, z);
+	REAL back1 = get_Field_LAP(pf, x, y-1, z);
+	REAL fwd1  = get_Field_LAP(pf, x, y+1, z);
+	REAL fwd2  = get_Field_LAP(pf, x, y+2, z);
+	REAL fwd3  = get_Field_LAP(pf, x, y+3, z);
+
+	get_Field(pfy , x-LAP,y-LAP,z-LAP) = (
+			45.0*( fwd1 - back1 ) 
+			-9.0*( fwd2 - back2 )
+				+( fwd3 - back3 ) )
+		/(60.0*hy_d);
+}
+
+
+/*
+ * Sixth-order central difference in z:
+ *   pfz = (45*(f[+1]-f[-1]) - 9*(f[+2]-f[-2]) + (f[+3]-f[-3])) / (60*hz_d).
+ * The block's x dimension is mapped onto the z index, its y dimension onto
+ * the x index and its z dimension onto the y index.  Indices include the LAP
+ * offset; the result is written to pfz at (index - LAP).
+ */
+__global__ void OCFD_dz0_CD6_kernel(cudaField pf , cudaField pfz , cudaJobPackage job){
+	unsigned int x = blockDim.y * blockIdx.y + threadIdx.y + job.start.x;
+	unsigned int y = blockDim.z * blockIdx.z + threadIdx.z + job.start.y;
+	unsigned int z = blockDim.x * blockIdx.x + threadIdx.x + job.start.z;
+
+	// Threads past the end of the job range do nothing.
+	if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+	REAL back3 = get_Field_LAP(pf, x, y, z-3);
+	REAL back2 = get_Field_LAP(pf, x, y, z-2);
+	REAL back1 = get_Field_LAP(pf, x, y, z-1);
+	REAL fwd1  = get_Field_LAP(pf, x, y, z+1);
+	REAL fwd2  = get_Field_LAP(pf, x, y, z+2);
+	REAL fwd3  = get_Field_LAP(pf, x, y, z+3);
+
+	get_Field(pfz, x-LAP, y-LAP, z-LAP) = (
+		45.0*( fwd1 - back1 ) 
+		-9.0*( fwd2 - back2 )
+			+( fwd3 - back3 ) )
+		/(60.0*hz_d);
+}
+
+//__global__ void OCFD_dz0_CD6_kernel(cudaField pf , cudaField pfz , cudaJobPackage job){
+//	// eyes on cells WITH LAPs
+//	extern __shared__ REAL hh[];
+//	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+//	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+//	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+//	unsigned int id = threadIdx.z + (blockDim.z + 6)* (threadIdx.x + threadIdx.y * blockDim.x);
+//
+//	if(threadIdx.z < 6)	hh[id] = get_Field_LAP(pf, x, y, z-3);
+//
+//	if(x < job.end.x && y < job.end.y && z < job.end.z){
+//
+//		hh[id + 6] = get_Field_LAP(pf, x, y, z+3);
+//		__syncthreads();
+//
+//		get_Field(pfz , x-LAP,y-LAP,z-LAP) = (
+//				45.0*( hh[id + 4] - hh[id + 2] ) 
+//				-9.0*( hh[id + 5] - hh[id + 1] )
+//					+( hh[id + 6] - hh[id    ] ) )
+//			/(60.0*hz_d);
+//	}
+//}
+//===================================================================================================================================//
+
+
+//-----------------------------------------------------------------CD8---------------------------------------------------------------//
+/*
+ * Eighth-order central difference in x:
+ *   pfx = (672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3])
+ *          - 3*(f[+4]-f[-4])) / (840*hx_d).
+ * Indices include the LAP offset; the result is written to pfx at (index - LAP).
+ */
+__global__ void OCFD_dx0_CD8_kernel(cudaField pf , cudaField pfx , cudaJobPackage job){
+	unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads past the end of the job range do nothing.
+	if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+	REAL back4 = get_Field_LAP(pf, x-4, y, z);
+	REAL back3 = get_Field_LAP(pf, x-3, y, z);
+	REAL back2 = get_Field_LAP(pf, x-2, y, z);
+	REAL back1 = get_Field_LAP(pf, x-1, y, z);
+	REAL fwd1  = get_Field_LAP(pf, x+1, y, z);
+	REAL fwd2  = get_Field_LAP(pf, x+2, y, z);
+	REAL fwd3  = get_Field_LAP(pf, x+3, y, z);
+	REAL fwd4  = get_Field_LAP(pf, x+4, y, z);
+
+	get_Field(pfx, x-LAP, y-LAP, z-LAP) = (
+			672.*( fwd1 - back1 ) 
+			-168*( fwd2 - back2 )
+			+32.*( fwd3 - back3 ) 
+			  -3*( fwd4 - back4 ) )
+		/(840.0*hx_d);
+}
+
+
+/*
+ * Eighth-order central difference in y:
+ *   pfy = (672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3])
+ *          - 3*(f[+4]-f[-4])) / (840*hy_d).
+ * The block's x dimension is mapped onto the y index and its y dimension onto
+ * the x index.  Indices include the LAP offset; the result is written to pfy
+ * at (index - LAP).
+ */
+__global__ void OCFD_dy0_CD8_kernel(cudaField pf , cudaField pfy , cudaJobPackage job){
+	unsigned int x = blockDim.y * blockIdx.y + threadIdx.y + job.start.x;
+	unsigned int y = blockDim.x * blockIdx.x + threadIdx.x + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// Threads past the end of the job range do nothing.
+	if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+	REAL back4 = get_Field_LAP(pf, x, y-4, z);
+	REAL back3 = get_Field_LAP(pf, x, y-3, z);
+	REAL back2 = get_Field_LAP(pf, x, y-2, z);
+	REAL back1 = get_Field_LAP(pf, x, y-1, z);
+	REAL fwd1  = get_Field_LAP(pf, x, y+1, z);
+	REAL fwd2  = get_Field_LAP(pf, x, y+2, z);
+	REAL fwd3  = get_Field_LAP(pf, x, y+3, z);
+	REAL fwd4  = get_Field_LAP(pf, x, y+4, z);
+
+	get_Field(pfy , x-LAP,y-LAP,z-LAP) = (
+			672.0*( fwd1 - back1 ) 
+		   -168.0*( fwd2 - back2 )
+			 +32.*( fwd3 - back3 ) 
+			  -3.*( fwd4 - back4 ) )
+		/(840.0*hy_d);
+}
+
+/*
+ * Eighth-order central difference in z:
+ *   pfz = (672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3])
+ *          - 3*(f[+4]-f[-4])) / (840*hz_d).
+ * The block's x dimension is mapped onto the z index, its y dimension onto
+ * the x index and its z dimension onto the y index.  Indices include the LAP
+ * offset; the result is written to pfz at (index - LAP).
+ */
+__global__ void OCFD_dz0_CD8_kernel(cudaField pf , cudaField pfz , cudaJobPackage job){
+	unsigned int x = blockDim.y * blockIdx.y + threadIdx.y + job.start.x;
+	unsigned int y = blockDim.z * blockIdx.z + threadIdx.z + job.start.y;
+	unsigned int z = blockDim.x * blockIdx.x + threadIdx.x + job.start.z;
+
+	// Threads past the end of the job range do nothing.
+	if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+	REAL back4 = get_Field_LAP(pf, x, y, z-4);
+	REAL back3 = get_Field_LAP(pf, x, y, z-3);
+	REAL back2 = get_Field_LAP(pf, x, y, z-2);
+	REAL back1 = get_Field_LAP(pf, x, y, z-1);
+	REAL fwd1  = get_Field_LAP(pf, x, y, z+1);
+	REAL fwd2  = get_Field_LAP(pf, x, y, z+2);
+	REAL fwd3  = get_Field_LAP(pf, x, y, z+3);
+	REAL fwd4  = get_Field_LAP(pf, x, y, z+4);
+
+	get_Field(pfz , x-LAP,y-LAP,z-LAP) = (
+			672.0*( fwd1 - back1 ) 
+		   -168.0*( fwd2 - back2 )
+			 +32.*( fwd3 - back3 ) 
+			  -3.*( fwd4 - back4 ) )
+		/(840.0*hz_d);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_Schemes_Choose.cu
+++ b/src/OCFD_Schemes_Choose.cu
+#include <math.h>
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_Schemes.h"
+#include "OCFD_Schemes_Choose.h"
+#include "OCFD_Schemes_hybrid_auto.h"
+#include "OCFD_bound_Scheme.h"
+#include "OCFD_flux_charteric.h"
+
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+
+#ifdef __cplusplus 
+extern "C"{
+#endif
+
+// Used in viscous Jacobian --------------------------------------------------------------------------------------------
+/*
+ * Host-side dispatcher for the x-derivative used by the viscous terms.
+ * Builds the direction/boundary flag (direction code 1), computes the launch
+ * configuration and launches the kernel selected by Scheme_vis_ID
+ * (203 -> OCFD_CD6_kernel, 204 -> OCFD_CD8_kernel).  Any other ID launches
+ * nothing.
+ *
+ * NOTE(review): the block shape is fixed at 8x8x4 here and blockdim_in is not
+ * consulted, unlike OCFD_dy0 / OCFD_dz0 -- confirm this is intentional.
+ */
+void OCFD_dx0(cudaField pf, cudaField pfx, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	dim3 size;
+	dim3 griddim, blockdim;
+	dim3 flagxyzb(1, 0, 0);
+
+	jobsize(&job_in, &size);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+	cal_grid_block_dim(&griddim, &blockdim, 8, 8, 4, size.x, size.y, size.z);
+
+	if(Scheme_vis_ID == 203){
+		CUDA_LAUNCH((OCFD_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfx, job_in) ));
+	}else if(Scheme_vis_ID == 204){
+		CUDA_LAUNCH((OCFD_CD8_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfx, job_in) ));
+	}
+}
+
+
+/*
+ * Host-side dispatcher for the y-derivative used by the viscous terms.
+ * Builds the direction/boundary flag (direction code 2), derives the launch
+ * configuration from blockdim_in and the job size, and launches the kernel
+ * selected by Scheme_vis_ID (203 -> OCFD_CD6_kernel, 204 -> OCFD_CD8_kernel).
+ * Any other ID launches nothing.
+ */
+void OCFD_dy0(cudaField pf, cudaField pfy, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	dim3 size;
+	dim3 griddim, blockdim;
+	dim3 flagxyzb(2, 0, 0);
+
+	jobsize(&job_in, &size);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+	cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x, blockdim_in.y, blockdim_in.z, size.x, size.y, size.z );
+
+	if(Scheme_vis_ID == 203){
+		CUDA_LAUNCH((OCFD_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfy, job_in) ));
+	}else if(Scheme_vis_ID == 204){
+		CUDA_LAUNCH((OCFD_CD8_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfy, job_in) ));
+	}
+}
+
+/*
+ * Host-side dispatcher for the z-derivative used by the viscous terms.
+ * Builds the direction/boundary flag (direction code 3), derives the launch
+ * configuration and launches the kernel selected by Scheme_vis_ID
+ * (203 -> OCFD_CD6_kernel, 204 -> OCFD_CD8_kernel).  Any other ID launches
+ * nothing.
+ *
+ * NOTE(review): the extents are passed to cal_grid_block_dim in the order
+ * (size.x, size.z, size.y) -- presumably matching the thread-to-index mapping
+ * of the launched kernels; confirm against OCFD_CD6_kernel / OCFD_CD8_kernel.
+ */
+void OCFD_dz0(cudaField pf, cudaField pfz, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	dim3 size;
+	dim3 griddim, blockdim;
+	dim3 flagxyzb(3, 0, 0);
+
+	jobsize(&job_in, &size);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+	cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x, blockdim_in.y, blockdim_in.z, size.x, size.z, size.y );
+
+	if(Scheme_vis_ID == 203){
+		CUDA_LAUNCH((OCFD_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfz, job_in) ));
+	}else if(Scheme_vis_ID == 204){
+		CUDA_LAUNCH((OCFD_CD8_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfz, job_in) ));
+	}
+}
+
+
+// Used in inviscid Jacobian flux+ ------------------------------------------------------------------------------------------
+/*
+ * Host-side dispatcher for the x-direction "plus" flux reconstruction.
+ *
+ * The launch configuration is computed with blockdim_in.x - 1 threads in x;
+ * afterwards the job start is moved one cell back in x and the block is
+ * widened by one thread in x.
+ *
+ * IF_CHARTERIC == 1 : a single launch of the characteristic-wise kernel for
+ *                     Scheme_invis_ID 304 / 305 / 310; every other known ID
+ *                     only prints a message on rank 0.
+ * otherwise         : one launch per component i = 0..4 of the kernel chosen
+ *                     by Scheme_invis_ID (301..310).
+ * Unknown IDs (and HybridAuto.Style values other than 1 or 2) launch nothing.
+ */
+void OCFD_dx1(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc, 
+	cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	// field with LAPs
+	dim3 size;
+	dim3 griddim, blockdim;
+	jobsize(&job_in, &size);
+	cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x-1, blockdim_in.y, blockdim_in.z, size.x, size.y, size.z );
+
+	// Original note (translated): .x positive-direction boundary; .y negative-direction boundary; .z non-reflecting boundary.
+	dim3 flagxyzb(1, 0, Non_ref[0]);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+
+	// Start one cell earlier in x and widen the block by one thread to match.
+	job_in.start.x  -= 1;
+	blockdim.x += 1;
+
+	if(IF_CHARTERIC == 1){
+		switch(Scheme_invis_ID){
+			case 304:
+			case 305:{
+				// 304: weno7_symbo (flag 0), 305: weno7_symbo_limiter (flag 1)
+				int limiter = (Scheme_invis_ID == 305) ? 1 : 0;
+				CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 0, *stream>>>(limiter, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
+				break;
+			}
+
+			case 310:
+				if(HybridAuto.Style == 1){
+					CUDA_LAUNCH((OCFD_HybridAuto_character_P_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
+				}else if(HybridAuto.Style == 2){
+					CUDA_LAUNCH((OCFD_HybridAuto_character_P_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
+				}
+				break;
+
+			// Schemes without a characteristic-wise variant.
+			case 301:
+			case 302:
+			case 303:
+			case 306:
+			case 307:
+			case 308:
+			case 309:
+				if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+				break;
+		}
+		return;
+	}
+
+	// Component-wise reconstruction: one launch per component.
+	for(int i=0; i<5; i++){
+		switch(Scheme_invis_ID){
+			case 301:
+				CUDA_LAUNCH((OCFD_UP7_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+				break;
+
+			case 302:
+				CUDA_LAUNCH((OCFD_weno5_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+				break;
+
+			case 303:
+				CUDA_LAUNCH((OCFD_weno7_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+				break;
+
+			case 304:
+			case 305:{
+				// 304: weno7_symbo (flag 0), 305: weno7_symbo_limiter (flag 1)
+				int limiter = (Scheme_invis_ID == 305) ? 1 : 0;
+				CUDA_LAUNCH((OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, limiter, flagxyzb, pf, pdu, Ajac, job_in) ));
+				break;
+			}
+
+			case 306:
+				CUDA_LAUNCH((OCFD_NND2_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+				break;
+
+			case 307:
+			case 308:
+			case 309:{
+				// IDs 307, 308, 309 map to OMP6 flags 0, 1, 2.
+				int omp6_flag = Scheme_invis_ID - 307;
+				CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, omp6_flag, flagxyzb, pf, pdu, Ajac, job_in) ));
+				break;
+			}
+
+			case 310:
+				if(HybridAuto.Style == 1){
+					CUDA_LAUNCH((OCFD_HybridAuto_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
+				}else if(HybridAuto.Style == 2){
+					CUDA_LAUNCH((OCFD_HybridAuto_P_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
+				}
+				break;
+		}
+	}
+
+}
+
+
+void OCFD_dy1(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc, 
+	cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+
+	dim3 size;
+	jobsize(&job_in, &size);
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, size.x, size.y, size.z);
+
+	dim3 flagxyzb(2, 0, Non_ref[2]);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+
+    job_in.start.y  -= 1;
+    blockdim.y += 1;
+
+	if(IF_CHARTERIC == 1){
+	    switch(Scheme_invis_ID){
+	    	case 301:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+
+	    	case 302:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+
+	    	case 303:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+    
+	    	case 304:
+	    	CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));//weno7_symbo
+	    	break;
+    
+	    	case 305:
+	    	CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));//weno7_symbo_limiter
+	    	break;
+    
+	    	case 306:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+    
+	    	case 307:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+    
+	    	case 308:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+    
+	    	case 309:
+            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+	    	break;
+    
+	    	case 310:
+			if(HybridAuto.Style == 1){
+	    		CUDA_LAUNCH((OCFD_HybridAuto_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_P_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
+			}
+			break;
+	    }
+	}else{
+        for(int i=0; i<5; i++){
+		switch(Scheme_invis_ID){
+	    	case 301:
+	    	CUDA_LAUNCH((OCFD_UP7_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+
+	    	case 302:
+	    	CUDA_LAUNCH((OCFD_weno5_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+
+	    	case 303:
+	    	CUDA_LAUNCH((OCFD_weno7_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+    
+	    	case 304:
+	    	CUDA_LAUNCH((OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));//weno7_symbo
+	    	break;
+    
+	    	case 305:
+	    	CUDA_LAUNCH(( OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));//weno7_symbo_limiter
+	    	break;
+    
+	    	case 306:
+	    	CUDA_LAUNCH((OCFD_NND2_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+    
+	    	case 307:
+	    	CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+    
+	    	case 308:
+	    	CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+    
+	    	case 309:
+	    	CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
+	    	break;
+    
+	    	case 310:
+			if(HybridAuto.Style == 1){
+	    		CUDA_LAUNCH((OCFD_HybridAuto_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
+			}else if(HybridAuto.Style == 2){
+	    		CUDA_LAUNCH((OCFD_HybridAuto_P_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
+			}
+			break;
+	    }
+        }
+	}
+}
+
+
+// Used in the inviscid Jacobian flux -----------------------------------------------------------------------------------------
+/*
+ * Host-side launcher: selects the "_P" reconstruction kernel that matches
+ * Scheme_invis_ID and enqueues it on *stream with direction flag 3. This is
+ * the variant that widens the job by one point at start.z and reads
+ * HybridAuto.scheme_z.
+ *
+ *   pf, pdu, Ajac            forwarded unchanged to every kernel
+ *   u, v, w, cc, Ax, Ay, Az  forwarded only to the "character" kernels
+ *   job_in                   index range; received by value, so the start.z
+ *                            adjustment below stays local to this call
+ *   blockdim_in              not used here (block shape is requested as 8,7,4)
+ *   stream                   stream that receives every launch
+ *   boundl, boundr           handed to OCFD_bound() together with flagxyzb
+ *
+ * A Scheme_invis_ID outside 301..310, or a HybridAuto.Style other than 1 or 2
+ * for scheme 310, launches nothing.
+ */
+void OCFD_dz1(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc, 
+	cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	dim3 size;
+	jobsize(&job_in, &size);
+
+	// Note the argument order: block/grid y is sized from size.z here.
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, size.x, size.z, size.y);
+
+	dim3 flagxyzb(3, 0, Non_ref[4]);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+
+	// One extra point in front of the job and one extra thread row in block y.
+	job_in.start.z -= 1;
+	blockdim.y += 1;
+
+	// Dynamic shared memory requested by every launch below.
+	const size_t smem_bytes = 16*8*4*sizeof(REAL);
+	// Mode argument of the weno7_SYMBO kernels: 0 for scheme 304, 1 for 305 (limiter).
+	const int symbo_mode = (Scheme_invis_ID == 305) ? 1 : 0;
+	// Mode argument of the OMP6 kernel: 0, 1, 2 for schemes 307, 308, 309.
+	const int omp6_mode = Scheme_invis_ID - 307;
+
+	if(IF_CHARTERIC == 1){
+		switch(Scheme_invis_ID){
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(symbo_mode, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_P_Jameson_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
+			}
+			break;
+
+			// No "character" kernel exists for these schemes: report on rank 0 only.
+			case 301: case 302: case 303:
+			case 306: case 307: case 308: case 309:
+			if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+			break;
+		}
+		return;
+	}
+
+	// Component-wise path: one launch per value of the leading argument i = 0..4.
+	for(int i = 0; i < 5; i++){
+		switch(Scheme_invis_ID){
+			case 301:
+			CUDA_LAUNCH((OCFD_UP7_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 302:
+			CUDA_LAUNCH((OCFD_weno5_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 303:
+			CUDA_LAUNCH((OCFD_weno7_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, symbo_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 306:
+			CUDA_LAUNCH((OCFD_NND2_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 307:
+			case 308:
+			case 309:
+			CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, omp6_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_P_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_P_Jameson_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
+			}
+			break;
+		}
+	}
+}
+
+
+/*
+ * Host-side launcher: selects the "_M" reconstruction kernel that matches
+ * Scheme_invis_ID and enqueues it on *stream with direction flag 4. This is
+ * the variant that widens the job by one point at end.x and reads
+ * HybridAuto.scheme_x. Operates on fields with LAPs.
+ *
+ *   pf, pdu, Ajac            forwarded unchanged to every kernel
+ *   u, v, w, cc, Ax, Ay, Az  forwarded only to the "character" kernels
+ *   job_in                   index range; received by value, so the end.x
+ *                            adjustment below stays local to this call
+ *   blockdim_in              requested block shape (x is reduced by one before
+ *                            sizing and restored afterwards)
+ *   stream                   stream that receives every launch
+ *   boundl, boundr           handed to OCFD_bound() together with flagxyzb
+ *
+ * Unlike the y/z launchers, no dynamic shared memory is requested here.
+ * A Scheme_invis_ID outside 301..310, or a HybridAuto.Style other than 1 or 2
+ * for scheme 310, launches nothing.
+ */
+void OCFD_dx2(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc, 
+	cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	dim3 size;
+	jobsize(&job_in, &size);
+
+	dim3 griddim, blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x-1, blockdim_in.y, blockdim_in.z, size.x, size.y, size.z );
+
+	dim3 flagxyzb(4, 0, Non_ref[1]);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+
+	// One extra point behind the job and one extra thread column in block x.
+	job_in.end.x += 1;
+	blockdim.x += 1;
+
+	// Mode argument of the weno7_SYMBO kernels: 0 for scheme 304, 1 for 305 (limiter).
+	const int symbo_mode = (Scheme_invis_ID == 305) ? 1 : 0;
+	// Mode argument of the OMP6 kernel: 0, 1, 2 for schemes 307, 308, 309.
+	const int omp6_mode = Scheme_invis_ID - 307;
+
+	if(IF_CHARTERIC == 1){
+		switch(Scheme_invis_ID){
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 0, *stream>>>(symbo_mode, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_M_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_M_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
+			}
+			break;
+
+			// No "character" kernel exists for these schemes: report on rank 0 only.
+			case 301: case 302: case 303:
+			case 306: case 307: case 308: case 309:
+			if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+			break;
+		}
+		return;
+	}
+
+	// Component-wise path: one launch per value of the leading argument i = 0..4.
+	for(int i = 0; i < 5; i++){
+		switch(Scheme_invis_ID){
+			case 301:
+			CUDA_LAUNCH((OCFD_UP7_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 302:
+			CUDA_LAUNCH((OCFD_weno5_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 303:
+			CUDA_LAUNCH((OCFD_weno7_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, symbo_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 306:
+			CUDA_LAUNCH((OCFD_NND2_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 307:
+			case 308:
+			case 309:
+			CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, omp6_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_M_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
+			}
+			break;
+		}
+	}
+}
+
+
+/*
+ * Host-side launcher: selects the "_M" reconstruction kernel that matches
+ * Scheme_invis_ID and enqueues it on *stream with direction flag 5. This is
+ * the variant that widens the job by one point at end.y and reads
+ * HybridAuto.scheme_y.
+ *
+ *   pf, pdu, Ajac            forwarded unchanged to every kernel
+ *   u, v, w, cc, Ax, Ay, Az  forwarded only to the "character" kernels
+ *   job_in                   index range; received by value, so the end.y
+ *                            adjustment below stays local to this call
+ *   blockdim_in              not used here (block shape is requested as 8,7,4)
+ *   stream                   stream that receives every launch
+ *   boundl, boundr           handed to OCFD_bound() together with flagxyzb
+ *
+ * A Scheme_invis_ID outside 301..310, or a HybridAuto.Style other than 1 or 2
+ * for scheme 310, launches nothing.
+ */
+void OCFD_dy2(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc, 
+	cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+
+	dim3 size;
+	jobsize(&job_in , &size);
+
+	dim3 griddim , blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, size.x, size.y, size.z);
+
+	dim3 flagxyzb(5, 0, Non_ref[3]);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+
+	// One extra point behind the job and one extra thread row in block y.
+	job_in.end.y += 1;
+	blockdim.y += 1;
+
+	// Dynamic shared memory requested by every launch below.
+	const size_t smem_bytes = 16*8*4*sizeof(REAL);
+	// Mode argument of the weno7_SYMBO kernels: 0 for scheme 304, 1 for 305 (limiter).
+	const int symbo_mode = (Scheme_invis_ID == 305) ? 1 : 0;
+	// Mode argument of the OMP6 kernel: 0, 1, 2 for schemes 307, 308, 309.
+	const int omp6_mode = Scheme_invis_ID - 307;
+
+	if(IF_CHARTERIC == 1){
+		switch(Scheme_invis_ID){
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(symbo_mode, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_M_Jameson_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
+			}
+			break;
+
+			// No "character" kernel exists for these schemes: report on rank 0 only.
+			case 301: case 302: case 303:
+			case 306: case 307: case 308: case 309:
+			if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+			break;
+		}
+		return;
+	}
+
+	// Component-wise path: one launch per value of the leading argument i = 0..4.
+	for(int i = 0; i < 5; i++){
+		switch(Scheme_invis_ID){
+			case 301:
+			CUDA_LAUNCH((OCFD_UP7_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 302:
+			CUDA_LAUNCH((OCFD_weno5_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 303:
+			CUDA_LAUNCH((OCFD_weno7_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, symbo_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 306:
+			CUDA_LAUNCH((OCFD_NND2_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 307:
+			case 308:
+			case 309:
+			CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, omp6_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_M_Jameson_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
+			}
+			break;
+		}
+	}
+}
+
+
+/*
+ * Host-side launcher: selects the "_M" reconstruction kernel that matches
+ * Scheme_invis_ID and enqueues it on *stream with direction flag 6. This is
+ * the variant that widens the job by one point at end.z and reads
+ * HybridAuto.scheme_z.
+ *
+ *   pf, pdu, Ajac            forwarded unchanged to every kernel
+ *   u, v, w, cc, Ax, Ay, Az  forwarded only to the "character" kernels
+ *   job_in                   index range; received by value, so the end.z
+ *                            adjustment below stays local to this call
+ *   blockdim_in              not used here (block shape is requested as 8,7,4)
+ *   stream                   stream that receives every launch
+ *   boundl, boundr           handed to OCFD_bound() together with flagxyzb
+ *
+ * A Scheme_invis_ID outside 301..310, or a HybridAuto.Style other than 1 or 2
+ * for scheme 310, launches nothing.
+ */
+void OCFD_dz2(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc, 
+	cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
+	dim3 size;
+	jobsize(&job_in , &size);
+
+	// Note the argument order: block/grid y is sized from size.z here.
+	dim3 griddim , blockdim;
+	cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, size.x, size.z, size.y);
+
+	dim3 flagxyzb(6, 0, Non_ref[5]);
+	OCFD_bound(&flagxyzb, boundl, boundr, job_in);
+
+	// One extra point behind the job and one extra thread row in block y.
+	job_in.end.z += 1;
+	blockdim.y += 1;
+
+	// Dynamic shared memory requested by every launch below.
+	const size_t smem_bytes = 16*8*4*sizeof(REAL);
+	// Mode argument of the weno7_SYMBO kernels: 0 for scheme 304, 1 for 305 (limiter).
+	const int symbo_mode = (Scheme_invis_ID == 305) ? 1 : 0;
+	// Mode argument of the OMP6 kernel: 0, 1, 2 for schemes 307, 308, 309.
+	const int omp6_mode = Scheme_invis_ID - 307;
+
+	if(IF_CHARTERIC == 1){
+		switch(Scheme_invis_ID){
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(symbo_mode, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_character_M_Jameson_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
+			}
+			break;
+
+			// No "character" kernel exists for these schemes: report on rank 0 only.
+			case 301: case 302: case 303:
+			case 306: case 307: case 308: case 309:
+			if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
+			break;
+		}
+		return;
+	}
+
+	// Component-wise path: one launch per value of the leading argument i = 0..4.
+	for(int i = 0; i < 5; i++){
+		switch(Scheme_invis_ID){
+			case 301:
+			CUDA_LAUNCH((OCFD_UP7_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 302:
+			CUDA_LAUNCH((OCFD_weno5_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 303:
+			CUDA_LAUNCH((OCFD_weno7_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 304:	// weno7_symbo
+			case 305:	// weno7_symbo_limiter
+			CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, symbo_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 306:
+			CUDA_LAUNCH((OCFD_NND2_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 307:
+			case 308:
+			case 309:
+			CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, omp6_mode, flagxyzb, pf, pdu, Ajac, job_in) ));
+			break;
+
+			case 310:
+			if(HybridAuto.Style == 1){
+				CUDA_LAUNCH((OCFD_HybridAuto_M_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
+			}else if(HybridAuto.Style == 2){
+				CUDA_LAUNCH((OCFD_HybridAuto_M_Jameson_kernel<<<griddim, blockdim, smem_bytes, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
+			}
+			break;
+		}
+	}
+}
+
+
+/*
+ * Launches OCFD_dx0_CD6_kernel over job_in on *stream, writing into pfx, and
+ * then calls OCFD_Dx0_bound() when bound is non-zero.
+ * Operates on fields with LAPs.
+ *
+ *   pf          input field
+ *   pfx         output field
+ *   job_in      index range of the launch
+ *   blockdim_in requested block shape, passed to cal_grid_block_dim() in
+ *               (x, y, z) order
+ *   bound       0 skips the boundary call, any other value enables it
+ */
+void OCFD_dx0_jac(cudaField pf, cudaField pfx, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int bound){
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, blockdim_in.x, blockdim_in.y, blockdim_in.z, extent.x, extent.y, extent.z);
+
+	CUDA_LAUNCH(( OCFD_dx0_CD6_kernel<<<grid, block, 0, *stream>>>(pf, pfx, job_in) ));
+
+	if(bound == 0) return;
+
+	OCFD_Dx0_bound(pf, pfx, job_in, blockdim_in, stream);
+}
+	
+/*
+ * Launches OCFD_dy0_CD6_kernel over job_in on *stream, writing into pfy, and
+ * then calls OCFD_Dy0_bound() when bound is non-zero.
+ *
+ *   pf          input field
+ *   pfy         output field
+ *   job_in      index range of the launch
+ *   blockdim_in requested block shape; its components and the job extents are
+ *               passed to cal_grid_block_dim() in (y, x, z) order
+ *   bound       0 skips the boundary call, any other value enables it
+ */
+void OCFD_dy0_jac(cudaField pf, cudaField pfy, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int bound){
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, blockdim_in.y, blockdim_in.x, blockdim_in.z, extent.y, extent.x, extent.z );
+
+	CUDA_LAUNCH(( OCFD_dy0_CD6_kernel<<<grid, block, 0, *stream>>>(pf, pfy, job_in) ));
+
+	if(bound == 0) return;
+
+	OCFD_Dy0_bound(pf, pfy, job_in, blockdim_in, stream);
+}
+	
+/*
+ * Launches OCFD_dz0_CD6_kernel over job_in on *stream, writing into pfz, and
+ * then calls OCFD_Dz0_bound() when bound is non-zero.
+ *
+ *   pf          input field
+ *   pfz         output field
+ *   job_in      index range of the launch
+ *   blockdim_in requested block shape; its components and the job extents are
+ *               passed to cal_grid_block_dim() in (z, x, y) order
+ *   bound       0 skips the boundary call, any other value enables it
+ */
+void OCFD_dz0_jac(cudaField pf, cudaField pfz, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int bound){
+	dim3 extent;
+	jobsize(&job_in, &extent);
+
+	dim3 grid, block;
+	cal_grid_block_dim(&grid, &block, blockdim_in.z, blockdim_in.x, blockdim_in.y, extent.z, extent.x, extent.y );
+
+	CUDA_LAUNCH(( OCFD_dz0_CD6_kernel<<<grid, block, 0, *stream>>>(pf, pfz, job_in) ));
+
+	if(bound == 0) return;
+
+	OCFD_Dz0_bound(pf, pfz, job_in, blockdim_in, stream);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_Schemes_hybrid_auto.cu
+++ b/src/OCFD_Schemes_hybrid_auto.cu
+#include <math.h>
+#include "parameters.h"
+#include "cuda_commen.h"
+#include "commen_kernel.h"
+
+#include "parameters_d.h"
+#include "OCFD_warp_shuffle.h"
+#include "cuda_utility.h"
+#include "OCFD_Schemes_hybrid_auto.h"
+#include "OCFD_Schemes_Choose.h"
+#include "OCFD_bound_Scheme.h"
+#include "OCFD_Schemes.h"
+#include "OCFD_mpi_dev.h"
+#include "OCFD_mpi.h"
+#include "OCFD_IO_mpi.h"
+
+#ifdef __cplusplus 
+extern "C"{
+#endif
+
+/*
+ * Driver for the hybrid-scheme selection step.
+ *
+ * Pressure is always recomputed first. After that:
+ *   HybridAuto.Style == 1  runs the gradient-based pipeline (Comput_grad,
+ *                          modify_NT, optional Smoothing_dp, Patch_zones,
+ *                          Boundary_dp, Comput_Scheme_point), in that order;
+ *   HybridAuto.Style == 2  runs Comput_Scheme_point_Jameson only;
+ *   any other value        does nothing further.
+ *
+ * stream is passed through to every step.
+ */
+void Set_Scheme_HybridAuto(cudaStream_t *stream){
+    Comput_P(pd_d, pT_d, pP_d, stream);
+
+    if(HybridAuto.Style == 2){
+        Comput_Scheme_point_Jameson(stream);
+        return;
+    }
+
+    if(HybridAuto.Style != 1) return;
+
+    Comput_grad(pP_d, stream);
+    modify_NT(stream);
+
+    // Smoothing is optional and controlled by its own switch.
+    if(HybridAuto.IF_Smooth_dp == 1){
+        Smoothing_dp(stream);
+    }
+
+    Patch_zones(stream);
+    Boundary_dp(stream);
+    Comput_Scheme_point(stream);
+}
+
+
+//---------------------------------------------------Comput_P--------------------------------------------------------
+/*
+ * Point-wise kernel: P = p00 * d * T, with all three fields indexed through
+ * get_Field_LAP. One thread per point; threads at or beyond job.end in any
+ * direction do nothing. job.start is not applied to the thread index.
+ */
+__global__ void Comput_P_kernel(cudaField d, cudaField T, cudaField P, REAL p00, cudaJobPackage job){
+    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
+    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
+    const unsigned int iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;
+
+    get_Field_LAP(P, ix, iy, iz) = p00 * get_Field_LAP(d, ix, iy, iz) * get_Field_LAP(T, ix, iy, iz);
+}
+
+/*
+ * Host wrapper for Comput_P_kernel: fills *P from *d and *T over the index
+ * box (0,0,0) .. (nx+2*LAP, ny+2*LAP, nz+2*LAP), using the prefactor
+ * p00 = 1/(Gamma*Ama*Ama). The launch is enqueued on *stream.
+ */
+void Comput_P(cudaField *d, cudaField *T, cudaField *P, cudaStream_t *stream){
+    const REAL p00 = 1.0/(Gamma*Ama*Ama);
+
+    cudaJobPackage job(dim3(0, 0, 0) , dim3(nx+2*LAP, ny+2*LAP, nz+2*LAP));
+
+    dim3 extent;
+    jobsize(&job, &extent);
+
+    dim3 grid, block;
+    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
+
+    CUDA_LAUNCH((Comput_P_kernel<<<grid, block, 0, *stream>>>(*d, *T, *P, p00, job)));
+}
+//--------------------------------------------------------------------------------------------------------------------
+
+
+//----------------------------------------------------Comput_grad_P----------------------------------------------------
+/*
+ * Adds to mySum the value obtained from __shfl_xor_double for the lane masks
+ * 32, 16, 8, 4, 2, 1 (in that order) and returns the accumulated result.
+ * The starting mask of 32 presumes 64-lane warps.
+ * NOTE(review): presumably __shfl_xor_double is an XOR lane exchange, which
+ * would make this a butterfly sum over the whole warp - confirm against
+ * OCFD_warp_shuffle.h.
+ */
+__device__ REAL warpReduce(REAL mySum){
+    for(int lane_mask = 32; lane_mask > 0; lane_mask >>= 1){
+        mySum += __shfl_xor_double(mySum, lane_mask, warpSize);
+    }
+    return mySum;
+}
+
+/*
+ * Per point: combines the three derivative fields pk, pi, ps with the
+ * coefficient fields A{k,i,s}{x,y,z} into (px, py, pz), stores
+ * sqrt(px^2 + py^2 + pz^2) in grad_f, and reduces those magnitudes to one
+ * partial sum per thread block, written to g_odata at the block's linear index.
+ *
+ *   pk, pi, ps, grad_f   indexed with get_Field at (x, y, z)
+ *   A**                  indexed with get_Field_LAP at (x+LAP, y+LAP, z+LAP)
+ *   SMEMDIM              number of per-warp partial sums held in shared[];
+ *                        the dynamic shared array must hold that many REALs
+ *   g_odata              one REAL per block
+ *
+ * Threads outside job.end contribute 0 but still take part in the reduction.
+ */
+__global__ void Comput_grad1_kernel(cudaField pk, cudaField pi, cudaField ps, cudaField Akx, cudaField Aky, 
+    cudaField Akz, cudaField Aix, cudaField Aiy, cudaField Aiz, cudaField Asx, cudaField Asy, cudaField Asz, 
+    int SMEMDIM, cudaField grad_f, REAL *g_odata, cudaJobPackage job){
+    extern __shared__ REAL shared[];
+
+    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
+    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
+    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
+
+    // Linear thread index inside the block, then its warp and lane.
+    const unsigned int tid  = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
+    const unsigned int warp = tid / warpSize;
+    const unsigned int lane = tid % warpSize;
+
+    REAL partial = 0.;
+
+    if(x < job.end.x && y < job.end.y && z < job.end.z){
+        const REAL dk = get_Field(pk, x, y, z);
+        const REAL di = get_Field(pi, x, y, z);
+        const REAL ds = get_Field(ps, x, y, z);
+
+        const REAL px = dk * get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP)
+                      + di * get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP)
+                      + ds * get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+
+        const REAL py = dk * get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP)
+                      + di * get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP)
+                      + ds * get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+
+        const REAL pz = dk * get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP)
+                      + di * get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP)
+                      + ds * get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+
+        partial = sqrt(px*px + py*py + pz*pz);
+        get_Field(grad_f, x, y, z) = partial;
+    }
+
+    // Stage 1: sum inside each warp; lane 0 parks the warp total in shared memory.
+    partial = warpReduce(partial);
+    if(lane == 0) shared[warp] = partial;
+    __syncthreads();
+
+    // Stage 2: the first SMEMDIM threads pick up one warp total each and
+    // warp 0 sums them.
+    partial = (tid < SMEMDIM) ? shared[tid] : 0;
+    if(warp == 0) partial = warpReduce(partial);
+
+    if(tid == 0) g_odata[blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z] = partial;
+}
+
+/*
+ * One pass of an in-place sum reduction over g_odata[0 .. g_odata_size).
+ *
+ * Each block sums the blockDim.x entries it covers and stores the result in
+ * g_odata[blockIdx.x]; entries at index >= gridDim.x are cleared so that a
+ * following pass (launched with a smaller grid but the same g_odata_size)
+ * only sees the new partial sums.
+ *
+ * Launch requirements visible in the code:
+ *   - the second stage reads exactly 8 per-warp totals, i.e. it is written
+ *     for blockDim.x == 512 with 64-lane warps;
+ *   - the dynamic shared array must hold at least 8 REALs
+ *     (8*sizeof(REAL) bytes).
+ *
+ * NOTE(review): with more than one block, block b stores into g_odata[b]
+ * while another block may not yet have loaded that entry. Nothing in this
+ * kernel orders those accesses - confirm whether a separate output buffer is
+ * needed.
+ */
+__global__ void add_kernel(REAL *g_odata, int g_odata_size){
+    extern __shared__ REAL shared[];
+    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int warpId  = threadIdx.x / warpSize;
+    unsigned int laneIdx = threadIdx.x % warpSize;
+
+    // Threads past the end of the buffer contribute 0.
+    REAL grad_f0 = 0.;
+    if(x < g_odata_size) grad_f0 = g_odata[x];
+
+    // Stage 1: per-warp sum, parked in shared memory by lane 0.
+    grad_f0 = warpReduce(grad_f0);
+    if(laneIdx == 0) shared[warpId] = grad_f0;
+    __syncthreads();
+
+    // Stage 2: the first 8 threads pick up one warp total each; warp 0 sums them.
+    grad_f0 = (threadIdx.x < 8)?shared[laneIdx]:0;
+
+    if(warpId == 0) grad_f0 = warpReduce(grad_f0);
+
+    // Clear consumed entries. The upper bound keeps the padding threads of the
+    // last block from storing beyond the g_odata_size elements of the buffer.
+    if(x >= gridDim.x && x < g_odata_size) g_odata[x] = 0.0;
+
+    if(threadIdx.x == 0) g_odata[blockIdx.x] = grad_f0;
+}
+
+/*
+ * Scales grad_f in place by grad_f_av1 at every point below job.end.
+ * grad_f is indexed with get_Field; job.start is not applied.
+ */
+__global__ void Comput_grad2_kernel(cudaField grad_f, REAL grad_f_av1, cudaJobPackage job){
+    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
+    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
+    const unsigned int iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;
+
+    get_Field(grad_f, ix, iy, iz) *= grad_f_av1;
+}
+
+/*
+ * Builds grad_P: the gradient magnitude of *P divided by its global mean.
+ *
+ * Steps:
+ *   1. differentiate *P with OCFD_dx0 / OCFD_dy0 / OCFD_dz0 into three
+ *      scratch fields;
+ *   2. Comput_grad1_kernel turns them into a magnitude per point (stored in
+ *      grad_P) and one partial sum per thread block;
+ *   3. add_kernel passes collapse the partial sums into g_odata[0];
+ *   4. MPI_Allreduce adds the per-rank sums, the mean over
+ *      NX_GLOBAL*NY_GLOBAL*NZ_GLOBAL points is formed, and
+ *      Comput_grad2_kernel multiplies grad_P by its reciprocal.
+ *
+ * The scratch fields and grad_P are slabs 0..3 of pdu_d, each
+ * pitch*ny*nz elements apart, so pdu_d is overwritten by this call.
+ *
+ * NOTE(review): a mean of exactly 0 makes grad_f_av1 infinite; confirm
+ * whether that case can occur.
+ * NOTE(review): the device-to-host copy is a plain cudaMemcpy while the
+ * kernels run on *stream; this presumably relies on *stream synchronising
+ * with the default stream - confirm how the stream is created.
+ */
+void Comput_grad(cudaField *P, cudaStream_t *stream){
+    cudaField Pk_d, Pi_d, Ps_d;
+
+    Pk_d.pitch = pdu_d->pitch; Pk_d.ptr = pdu_d->ptr;
+    Pi_d.pitch = pdu_d->pitch; Pi_d.ptr = pdu_d->ptr + pdu_d->pitch*ny*nz;
+    Ps_d.pitch = pdu_d->pitch; Ps_d.ptr = pdu_d->ptr + 2 * pdu_d->pitch*ny*nz;
+    grad_P.pitch = pdu_d->pitch; grad_P.ptr = pdu_d->ptr + 3 * pdu_d->pitch*ny*nz;
+
+    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
+
+    OCFD_dx0(*P, Pk_d, job, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
+    OCFD_dy0(*P, Pi_d, job, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
+    OCFD_dz0(*P, Ps_d, job, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);
+
+
+    dim3 size, griddim, blockdim;
+    job.setup(dim3(0, 0, 0), dim3(nx, ny, nz));
+	jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+    REAL *g_odata;
+    // Host-side landing spot for this rank's sum. A local variable replaces the
+    // former malloc'ed REAL, which was never freed.
+    REAL Sum = 0.0;
+
+    unsigned int g_odata_size = griddim.x*griddim.y*griddim.z;
+    CUDA_LAUNCH(( cudaMalloc((REAL **)&g_odata, g_odata_size*sizeof(REAL)) ));
+
+    // Number of warps per block, rounded up so a trailing partial warp also
+    // gets a shared-memory slot.
+    int SMEMDIM = (blockdim.x*blockdim.y*blockdim.z + 63)/64;   //Warpsize is 64
+    // The third launch parameter is a byte count: one REAL per warp.
+    CUDA_LAUNCH((Comput_grad1_kernel<<<griddim, blockdim, SMEMDIM*sizeof(REAL), *stream>>>(Pk_d, Pi_d, Ps_d, *pAkx_d, *pAky_d, 
+        *pAkz_d, *pAix_d, *pAiy_d, *pAiz_d, *pAsx_d, *pAsy_d, *pAsz_d, SMEMDIM, grad_P, g_odata, job)));
+
+    dim3 blockdim_sum(512);
+    dim3 griddim_sum(g_odata_size); 
+
+    // Repeat until a single block has produced the total in g_odata[0].
+    do{
+        griddim_sum.x = (griddim_sum.x + blockdim_sum.x - 1)/blockdim_sum.x;
+        // add_kernel stores 8 per-warp totals (512 threads / 64 lanes) in shared memory.
+        CUDA_LAUNCH(( add_kernel<<<griddim_sum, blockdim_sum, 8*sizeof(REAL), *stream>>>(g_odata, g_odata_size) ));
+    } while(griddim_sum.x > 1);
+
+    CUDA_LAUNCH(( cudaMemcpy(&Sum, g_odata, sizeof(REAL), cudaMemcpyDeviceToHost) ));
+    CUDA_LAUNCH(( cudaFree(g_odata) ));
+
+    REAL grad_f_av, grad_f_av1;
+
+    MPI_Allreduce(&Sum, &grad_f_av, 1, OCFD_DATA_TYPE, MPI_SUM, MPI_COMM_WORLD);
+
+    grad_f_av = grad_f_av/(NX_GLOBAL * NY_GLOBAL * NZ_GLOBAL);
+    grad_f_av1 = 1.0/grad_f_av;
+
+    CUDA_LAUNCH((Comput_grad2_kernel<<<griddim, blockdim, 0, *stream>>>(grad_P, grad_f_av1, job)));
+}
+//----------------------------------------------------------------------------------------------------------
+
+//---------------------------------------------Modify Negative T--------------------------------------------
+/*
+ * Repairs negative values of T and flags the affected points in grad_f.
+ *
+ * For every point in [job.start, job.end) whose T is below zero:
+ *   - T is replaced by the mean of its six face neighbours;
+ *   - grad_f is raised to max(10 * grad_f, P_intvs + 1).
+ *
+ * T is a field with LAP and is addressed with the job (LAP-shifted)
+ * coordinates. grad_f is addressed with get_Field at (x-LAP, y-LAP, z-LAP),
+ * the same convention Modify_P_kernel and Modify_grad_inner_kernel use for it.
+ * The earlier code read it with get_Field_LAP and wrote it with get_Field,
+ * both at the unshifted (x, y, z), so it touched a different point.
+ * NOTE(review): get_Field / get_Field_LAP are defined outside this file -
+ * confirm the layout assumption.
+ *
+ * NOTE(review): neighbouring T values are read while other threads may be
+ * rewriting them in the same launch; the result for adjacent negative points
+ * therefore depends on scheduling.
+ */
+__global__ void ana_NT_kernel(cudaField T, cudaField grad_f, REAL P_intvs, cudaJobPackage job){
+    // field with LAP
+    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+	
+	if(x < job.end.x && y < job.end.y && z < job.end.z){
+        REAL t;
+        t = get_Field_LAP(T , x,y,z);
+        if(t < 0){
+            t = get_Field_LAP(T , x-1 , y , z) + get_Field_LAP(T , x+1 , y , z)
+               +get_Field_LAP(T , x , y-1 , z) + get_Field_LAP(T , x , y+1 , z)
+               +get_Field_LAP(T , x , y , z-1) + get_Field_LAP(T , x , y , z+1);
+            get_Field_LAP(T , x,y,z) = t/6.0;
+
+            // Read and write the same no-LAP element of grad_f.
+            get_Field(grad_f, x-LAP, y-LAP, z-LAP) = fmax(10.*get_Field(grad_f, x-LAP, y-LAP, z-LAP), P_intvs + 1.);
+        }
+	}
+}
+
+
+/*
+ * Host wrapper for ana_NT_kernel: runs the negative-T repair on *pT_d and
+ * grad_P over (LAP,LAP,LAP) .. (nx_lap, ny_lap, nz_lap), with the grid sized
+ * for nx x ny x nz points and HybridAuto.P_intvs[1] as the threshold argument.
+ * The launch is enqueued on *stream.
+ */
+void modify_NT(cudaStream_t *stream){
+    dim3 griddim , blockdim;
+    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );
+    cudaJobPackage job(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));
+    // Wrapped in CUDA_LAUNCH like every other kernel launch in this file.
+    CUDA_LAUNCH(( ana_NT_kernel<<<griddim , blockdim, 0, *stream>>>(*pT_d, grad_P,  HybridAuto.P_intvs[1], job) ));
+}
+
+
+
+//----------------------------------------------Smoothing_dp------------------------------------------------
+/*
+ * Copies grad_f into f for every point in [job.start, job.end), tripling the
+ * values that are >= P_intvs. f is addressed with get_Field_LAP at the job
+ * coordinates, grad_f with get_Field at the coordinates shifted back by LAP.
+ */
+__global__ void Modify_P_kernel(cudaField f, cudaField grad_f, REAL P_intvs, cudaJobPackage job){
+    const unsigned int x = job.start.x + (blockDim.x * blockIdx.x + threadIdx.x);
+    const unsigned int y = job.start.y + (blockDim.y * blockIdx.y + threadIdx.y);
+    const unsigned int z = job.start.z + (blockDim.z * blockIdx.z + threadIdx.z);
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    const REAL g = get_Field(grad_f, x-LAP, y-LAP, z-LAP);
+    get_Field_LAP(f, x, y, z) = (g >= P_intvs) ? 3*g : g;
+}
+
+/*
+ * Seven-point weighted average of f written back to grad_f:
+ *   grad_f = f(centre)/3 + (sum of the six face neighbours)/9
+ * for every point in [job.start, job.end). f is addressed with get_Field_LAP
+ * at the job coordinates, grad_f with get_Field shifted back by LAP.
+ */
+__global__ void Modify_grad_inner_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
+    const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    const REAL centre = get_Field_LAP(f, x, y, z);
+
+    // Neighbours are added in the order +x, -x, +y, -y, +z, -z.
+    const REAL faces = get_Field_LAP(f, x+1, y, z) + get_Field_LAP(f, x-1, y, z)
+                     + get_Field_LAP(f, x, y+1, z) + get_Field_LAP(f, x, y-1, z)
+                     + get_Field_LAP(f, x, y, z+1) + get_Field_LAP(f, x, y, z-1);
+
+    get_Field(grad_f, x-LAP, y-LAP, z-LAP) = centre/3.0 + faces/9.0;
+}
+
+// Copies f back into grad_f without any smoothing over the job box:
+//   grad_f(x, y, z) = f(x+LAP, y+LAP, z+LAP)
+// grad_f is accessed with get_Field, f with get_Field_LAP.
+//   job : index box [job.start, job.end) expressed in the indices used for grad_f
+__global__ void Modify_grad_outer_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field(grad_f, x, y, z) = get_Field_LAP(f, x+LAP, y+LAP, z+LAP);
+}
+
+// Smooths grad_P through the work field *pPP_d; every kernel is queued on *stream.
+//  1. Modify_P_kernel fills *pPP_d from grad_P over the box (LAP..n+LAP), using the threshold
+//     HybridAuto.P_intvs[HybridA_Stage - 1].
+//  2. exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream) is called on the work field.
+//  3. Modify_grad_inner_kernel writes the weighted 7-point combination back into grad_P.
+//  4. For each direction that is not periodic, the first and last rank along that direction
+//     run Modify_grad_outer_kernel on a one-cell-thick layer (index 0 or n-1), which replaces
+//     the combined value by the value stored in *pPP_d.
+void Smoothing_dp(cudaStream_t *stream){
+    const REAL P_intvs = HybridAuto.P_intvs[HybridA_Stage - 1];
+
+    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
+
+    dim3 size;
+    dim3 griddim, blockdim;
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+    CUDA_LAUNCH(( Modify_P_kernel<<<griddim, blockdim, 0, *stream>>>(*pPP_d, grad_P, P_intvs, job) ));
+
+    exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream);
+
+    CUDA_LAUNCH(( Modify_grad_inner_kernel<<<griddim, blockdim, 0, *stream>>>(*pPP_d, grad_P, job) ));
+
+    // A face layer is only patched when its direction is not periodic.
+    const bool open_x = (Iperiodic[0] == 0);
+    const bool open_y = (Iperiodic[1] == 0);
+    const bool open_z = (Iperiodic[2] == 0);
+
+    //------------------------------------- x faces -----------------------------------------------
+    if (open_x && npx == 0)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(0, 0, 0), dim3(1, ny, nz));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
+        CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, face_job) ));
+    }
+    if (open_x && npx == NPX0 - 1)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(nx-1, 0, 0), dim3(nx, ny, nz));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
+        CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, face_job) ));
+    }
+
+    //------------------------------------- y faces -----------------------------------------------
+    if (open_y && npy == 0)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(0, 0, 0), dim3(nx, 1, nz));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
+        CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, face_job) ));
+    }
+    if (open_y && npy == NPY0 - 1)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(0, ny-1, 0), dim3(nx, ny, nz));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
+        CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, face_job) ));
+    }
+
+    //------------------------------------- z faces -----------------------------------------------
+    if (open_z && npz == 0)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(0, 0, 0), dim3(nx, ny, 1));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
+        CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, face_job) ));
+    }
+    if (open_z && npz == NPZ0 - 1)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(0, 0, nz-1), dim3(nx, ny, nz));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
+        CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, face_job) ));
+    }
+}
+//-------------------------------------------------------------------------------------------------------------------
+
+
+//----------------------------------------------------Patch_zones----------------------------------------------------
+// Fills grad_f with the constant Pa_zones over the job box.
+//   grad_f   : field written through get_Field
+//   Pa_zones : value stored at every point of the box
+//   job      : index box [job.start, job.end)
+__global__ void Patch_zones_kernel(cudaField grad_f, REAL Pa_zones, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field(grad_f, x, y, z) = Pa_zones;
+}
+
+// Overwrites grad_P with a fixed value inside each user-defined patch zone.
+// HybridAuto.zones is read as Num_Patch_zones rows of six ints
+// {i_begin, i_end, j_begin, j_end, k_begin, k_end}. Each bound is converted by
+// get_i_node / get_j_node / get_k_node into a rank index and a local index. The local range is
+// then clipped to this rank: a bound owned by a lower rank becomes 0, one owned by a higher
+// rank becomes nx / ny / nz. Zones that do not intersect this rank are skipped; for the others
+// Patch_zones_kernel is queued on *stream with the value HybridAuto.Pa_zones[i].
+void Patch_zones(cudaStream_t *stream){
+    int node_ib, node_ie, node_jb, node_je, node_kb, node_ke;
+    int ib, ie, jb, je, kb, ke;
+    int (*zone_table)[6] = (int(*)[6])HybridAuto.zones;
+
+    for(int i = 0; i < HybridAuto.Num_Patch_zones; i++){
+        int *zone = zone_table[i];
+
+        get_i_node(zone[0], &node_ib, &ib);
+        get_i_node(zone[1], &node_ie, &ie);
+        get_j_node(zone[2], &node_jb, &jb);
+        get_j_node(zone[3], &node_je, &je);
+        get_k_node(zone[4], &node_kb, &kb);
+        get_k_node(zone[5], &node_ke, &ke);
+
+        // A direction misses this rank when the zone starts after it or ends before it.
+        const int flag_i = (node_ib > npx || node_ie < npx) ? 0 : 1;
+        const int flag_j = (node_jb > npy || node_je < npy) ? 0 : 1;
+        const int flag_k = (node_kb > npz || node_ke < npz) ? 0 : 1;
+
+        // Clip bounds that belong to neighbouring ranks to the local extent.
+        if(node_ib < npx) ib = 0;
+        if(node_ie > npx) ie = nx;
+
+        if(node_jb < npy) jb = 0;
+        if(node_je > npy) je = ny;
+
+        if(node_kb < npz) kb = 0;
+        if(node_ke > npz) ke = nz;
+
+        if(flag_i*flag_j*flag_k == 0) continue;
+
+        cudaJobPackage job(dim3(ib, jb, kb) , dim3(ie, je, ke));
+
+        dim3 size, griddim, blockdim;
+        jobsize(&job, &size);
+        cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+        REAL Pa_zones = HybridAuto.Pa_zones[i];
+        CUDA_LAUNCH(( Patch_zones_kernel<<<griddim, blockdim, 0, *stream>>>(grad_P, Pa_zones, job) ));
+    }
+}
+//---------------------------------------------------------------------------------------------------------------
+
+
+//----------------------------------------------Boundary_dp-----------------------------------------------------
+// Copies grad_f into f without modification.
+// Thread indices (x, y, z) are relative to the job box; grad_f is read at (x, y, z) via
+// get_Field and f is written via get_Field_LAP at (x, y, z) shifted by a linear offset built
+// from job.start, f.pitch and ny_2lap_d.
+//   job : index box; only its extent (end - start) and start are used
+__global__ void Modify_P_all_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + f.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    get_Field_LAP(f, x, y, z, offset) = get_Field(grad_f, x, y, z);
+}
+
+// Copies every value of f in the job box one cell towards lower x:
+//   f(x-1, y, z) = f(x, y, z), accessed through get_Field_LAP.
+//   job : index box [job.start, job.end)
+__global__ void Modify_x_P_outer_kernel(cudaField f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(f, x-1, y, z) = get_Field_LAP(f, x, y, z);
+}
+
+// Copies every value of f in the job box one cell towards higher x:
+//   f(x+1, y, z) = f(x, y, z), accessed through get_Field_LAP.
+//   job : index box [job.start, job.end)
+__global__ void Modify_x_M_outer_kernel(cudaField f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(f, x+1, y, z) = get_Field_LAP(f, x, y, z);
+}
+
+// Copies every value of f in the job box one cell towards lower y:
+//   f(x, y-1, z) = f(x, y, z), accessed through get_Field_LAP.
+//   job : index box [job.start, job.end)
+__global__ void Modify_y_P_outer_kernel(cudaField f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(f, x, y-1, z) = get_Field_LAP(f, x, y, z);
+}
+
+// Copies every value of f in the job box one cell towards higher y:
+//   f(x, y+1, z) = f(x, y, z), accessed through get_Field_LAP.
+//   job : index box [job.start, job.end)
+__global__ void Modify_y_M_outer_kernel(cudaField f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(f, x, y+1, z) = get_Field_LAP(f, x, y, z);
+}
+
+// Copies every value of f in the job box one cell towards lower z:
+//   f(x, y, z-1) = f(x, y, z), accessed through get_Field_LAP.
+//   job : index box [job.start, job.end)
+__global__ void Modify_z_P_outer_kernel(cudaField f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(f, x, y, z-1) = get_Field_LAP(f, x, y, z);
+}
+
+// Copies every value of f in the job box one cell towards higher z:
+//   f(x, y, z+1) = f(x, y, z), accessed through get_Field_LAP.
+//   job : index box [job.start, job.end)
+__global__ void Modify_z_M_outer_kernel(cudaField f, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(f, x, y, z+1) = get_Field_LAP(f, x, y, z);
+}
+
+// Loads grad_P into the work field *pPP_d and extends it across non-periodic faces; every
+// kernel is queued on *stream.
+//  1. Modify_P_all_kernel copies grad_P into *pPP_d over the box (LAP..n+LAP).
+//  2. exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream) is called on the work field.
+//  3. For each direction that is not periodic, the first rank along it copies its first
+//     layer one cell outwards (the *_P_outer kernels) and the last rank copies its last
+//     layer one cell outwards (the *_M_outer kernels).
+void Boundary_dp(cudaStream_t *stream){
+    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
+
+    dim3 size;
+    dim3 griddim, blockdim;
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+    CUDA_LAUNCH(( Modify_P_all_kernel<<<griddim, blockdim, 0, *stream>>>(*pPP_d, grad_P, job) ));
+
+    exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream);
+
+    // A face layer is only extended when its direction is not periodic.
+    const bool open_x = (Iperiodic[0] == 0);
+    const bool open_y = (Iperiodic[1] == 0);
+    const bool open_z = (Iperiodic[2] == 0);
+
+    //------------------------------------- x faces -----------------------------------------------
+    if (open_x && npx == 0)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(LAP, LAP, LAP), dim3(LAP+1, ny+LAP, nz+LAP));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
+        CUDA_LAUNCH(( Modify_x_P_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face_job) ));
+    }
+    if (open_x && npx == NPX0 - 1)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(nx+LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
+        CUDA_LAUNCH(( Modify_x_M_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face_job) ));
+    }
+
+    //------------------------------------- y faces -----------------------------------------------
+    if (open_y && npy == 0)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(LAP, LAP, LAP), dim3(nx+LAP, LAP+1, nz+LAP));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
+        CUDA_LAUNCH(( Modify_y_P_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face_job) ));
+    }
+    if (open_y && npy == NPY0 - 1)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(LAP, ny+LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
+        CUDA_LAUNCH(( Modify_y_M_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face_job) ));
+    }
+
+    //------------------------------------- z faces -----------------------------------------------
+    if (open_z && npz == 0)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(LAP, LAP, LAP), dim3(nx+LAP, ny+LAP, LAP+1));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
+        CUDA_LAUNCH(( Modify_z_P_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face_job) ));
+    }
+    if (open_z && npz == NPZ0 - 1)
+    {
+        dim3 face_grid, face_block;
+        cudaJobPackage face_job(dim3(LAP, LAP, nz+LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));
+
+        cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
+        CUDA_LAUNCH(( Modify_z_M_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face_job) ));
+    }
+}
+//---------------------------------------------------------------------------------------------------------------------
+
+
+//-----------------------------------------------Comput_Scheme_point---------------------------------------------------
+// Chooses the scheme type for every x-direction point of the job box.
+// The indicator is the average of P at the point and at its +x neighbour; the stored type is
+// 1, plus one for each of the thresholds P_intvs1 and P_intvs2 that the indicator exceeds.
+// Thread indices are relative to the job box; P is read through get_Field_LAP with a linear
+// offset built from job.start, and the result is stored in scheme with row stride
+// scheme.pitch and ny_d rows per z-plane.
+__global__ void Comput_Scheme_point_x_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    const REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x+1, y, z, offset));
+
+    const int kp = 1 + ((dp0 > P_intvs1) ? 1 : 0) + ((dp0 > P_intvs2) ? 1 : 0);
+    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
+}
+
+// Chooses the scheme type for every y-direction point of the job box.
+// The indicator is the average of P at the point and at its +y neighbour; the stored type is
+// 1, plus one for each of the thresholds P_intvs1 and P_intvs2 that the indicator exceeds.
+// Thread indices are relative to the job box; P is read through get_Field_LAP with a linear
+// offset built from job.start, and the result is stored in scheme with row stride
+// scheme.pitch and (ny_d + 1) rows per z-plane.
+__global__ void Comput_Scheme_point_y_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    const REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y+1, z, offset));
+
+    const int kp = 1 + ((dp0 > P_intvs1) ? 1 : 0) + ((dp0 > P_intvs2) ? 1 : 0);
+    *(scheme.ptr + (x + scheme.pitch *(y + (z)*(ny_d + 1)))) = kp;
+}
+
+// Chooses the scheme type for every z-direction point of the job box.
+// The indicator is the average of P at the point and at its +z neighbour; the stored type is
+// 1, plus one for each of the thresholds P_intvs1 and P_intvs2 that the indicator exceeds.
+// Thread indices are relative to the job box; P is read through get_Field_LAP with a linear
+// offset built from job.start, and the result is stored in scheme with row stride
+// scheme.pitch and ny_d rows per z-plane.
+__global__ void Comput_Scheme_point_z_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    const REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y, z+1, offset));
+
+    const int kp = 1 + ((dp0 > P_intvs1) ? 1 : 0) + ((dp0 > P_intvs2) ? 1 : 0);
+    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
+}
+
+// Builds the three directional scheme maps from the work field *pPP_d on *stream.
+// For each direction the job box starts one cell earlier along that direction (LAP-1) and
+// ends at n+LAP, so it holds one more point along that direction than the local extent.
+// Results go to HybridAuto.scheme_x / scheme_y / scheme_z; the two thresholds are
+// HybridAuto.P_intvs[0] and HybridAuto.P_intvs[1] (the original notes HybridA_Stage == 3).
+void Comput_Scheme_point(cudaStream_t *stream){
+
+    const REAL thr_lo = HybridAuto.P_intvs[0];
+    const REAL thr_hi = HybridAuto.P_intvs[1];
+
+    dim3 size, griddim, blockdim;
+
+    // x direction
+    cudaJobPackage job(dim3(LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+    CUDA_LAUNCH(( Comput_Scheme_point_x_kernel<<<griddim, blockdim, 0, *stream>>>(*pPP_d, *HybridAuto.scheme_x, thr_lo, thr_hi, job) ));
+
+    // y direction
+    job.setup(dim3(LAP, LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+    CUDA_LAUNCH(( Comput_Scheme_point_y_kernel<<<griddim, blockdim, 0, *stream>>>(*pPP_d, *HybridAuto.scheme_y, thr_lo, thr_hi, job) ));
+
+    // z direction
+    job.setup(dim3(LAP, LAP, LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+    CUDA_LAUNCH(( Comput_Scheme_point_z_kernel<<<griddim, blockdim, 0, *stream>>>(*pPP_d, *HybridAuto.scheme_z, thr_lo, thr_hi, job) ));
+}
+
+
+// Chooses the scheme type for every x-direction point of the job box from a second-difference
+// sensor on P. For each of the three directions the term
+//   |-P(-1) + 2*P(0) - P(+1)| / (P(-1) + 2*P(0) + P(+1))
+// is evaluated and the three terms are summed; the stored type is 1, plus one for each of the
+// thresholds P_intvs1 and P_intvs2 that the sum exceeds.
+// Thread indices are relative to the job box; P is read through get_Field_LAP with a linear
+// offset built from job.start, and the result is stored in scheme with row stride
+// scheme.pitch and ny_d rows per z-plane.
+__global__ void Comput_Scheme_point_x_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    // Centre value and the six face neighbours.
+    const REAL pc  = get_Field_LAP(P, x, y, z, offset);
+    const REAL pxm = get_Field_LAP(P, x-1, y, z, offset);
+    const REAL pxp = get_Field_LAP(P, x+1, y, z, offset);
+    const REAL pym = get_Field_LAP(P, x, y-1, z, offset);
+    const REAL pyp = get_Field_LAP(P, x, y+1, z, offset);
+    const REAL pzm = get_Field_LAP(P, x, y, z-1, offset);
+    const REAL pzp = get_Field_LAP(P, x, y, z+1, offset);
+
+    REAL dp0 = fabs(-pxm + 2*pc - pxp)/
+    (pxm + 2*pc + pxp);
+
+    dp0 += fabs(-pym + 2*pc - pyp)/
+    (pym + 2*pc + pyp);
+
+    dp0 += fabs(-pzm + 2*pc - pzp)/
+    (pzm + 2*pc + pzp);
+
+    const int kp = 1 + ((dp0 > P_intvs1) ? 1 : 0) + ((dp0 > P_intvs2) ? 1 : 0);
+    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
+}
+
+// Chooses the scheme type for every y-direction point of the job box from a second-difference
+// sensor on P. For each of the three directions the term
+//   |-P(-1) + 2*P(0) - P(+1)| / (P(-1) + 2*P(0) + P(+1))
+// is evaluated and the three terms are summed; the stored type is 1, plus one for each of the
+// thresholds P_intvs1 and P_intvs2 that the sum exceeds.
+// Thread indices are relative to the job box; P is read through get_Field_LAP with a linear
+// offset built from job.start, and the result is stored in scheme with row stride
+// scheme.pitch and (ny_d + 1) rows per z-plane.
+__global__ void Comput_Scheme_point_y_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    // Centre value and the six face neighbours.
+    const REAL pc  = get_Field_LAP(P, x, y, z, offset);
+    const REAL pxm = get_Field_LAP(P, x-1, y, z, offset);
+    const REAL pxp = get_Field_LAP(P, x+1, y, z, offset);
+    const REAL pym = get_Field_LAP(P, x, y-1, z, offset);
+    const REAL pyp = get_Field_LAP(P, x, y+1, z, offset);
+    const REAL pzm = get_Field_LAP(P, x, y, z-1, offset);
+    const REAL pzp = get_Field_LAP(P, x, y, z+1, offset);
+
+    REAL dp0 = fabs(-pxm + 2*pc - pxp)/
+    (pxm + 2*pc + pxp);
+
+    dp0 += fabs(-pym + 2*pc - pyp)/
+    (pym + 2*pc + pyp);
+
+    dp0 += fabs(-pzm + 2*pc - pzp)/
+    (pzm + 2*pc + pzp);
+
+    const int kp = 1 + ((dp0 > P_intvs1) ? 1 : 0) + ((dp0 > P_intvs2) ? 1 : 0);
+    *(scheme.ptr + (x + scheme.pitch *(y + (z)*(ny_d + 1)))) = kp;
+}
+
+// Chooses the scheme type for every z-direction point of the job box from a second-difference
+// sensor on P. For each of the three directions the term
+//   |-P(-1) + 2*P(0) - P(+1)| / (P(-1) + 2*P(0) + P(+1))
+// is evaluated and the three terms are summed; the stored type is 1, plus one for each of the
+// thresholds P_intvs1 and P_intvs2 that the sum exceeds.
+// Thread indices are relative to the job box; P is read through get_Field_LAP with a linear
+// offset built from job.start, and the result is stored in scheme with row stride
+// scheme.pitch and ny_d rows per z-plane.
+__global__ void Comput_Scheme_point_z_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+    unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);
+
+    if(x >= (job.end.x - job.start.x)) return;
+    if(y >= (job.end.y - job.start.y)) return;
+    if(z >= (job.end.z - job.start.z)) return;
+
+    // Centre value and the six face neighbours.
+    const REAL pc  = get_Field_LAP(P, x, y, z, offset);
+    const REAL pxm = get_Field_LAP(P, x-1, y, z, offset);
+    const REAL pxp = get_Field_LAP(P, x+1, y, z, offset);
+    const REAL pym = get_Field_LAP(P, x, y-1, z, offset);
+    const REAL pyp = get_Field_LAP(P, x, y+1, z, offset);
+    const REAL pzm = get_Field_LAP(P, x, y, z-1, offset);
+    const REAL pzp = get_Field_LAP(P, x, y, z+1, offset);
+
+    REAL dp0 = fabs(-pxm + 2*pc - pxp)/
+    (pxm + 2*pc + pxp);
+
+    dp0 += fabs(-pym + 2*pc - pyp)/
+    (pym + 2*pc + pyp);
+
+    dp0 += fabs(-pzm + 2*pc - pzp)/
+    (pzm + 2*pc + pzp);
+
+    const int kp = 1 + ((dp0 > P_intvs1) ? 1 : 0) + ((dp0 > P_intvs2) ? 1 : 0);
+    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
+}
+
+// Builds the three directional scheme maps from the field *pP_d with the Jameson-type
+// kernels, queued on *stream.
+// For each direction the job box starts one cell earlier along that direction (LAP-1) and
+// ends at n+LAP, so it holds one more point along that direction than the local extent.
+// Results go to HybridAuto.scheme_x / scheme_y / scheme_z; the two thresholds are
+// HybridAuto.P_intvs[0] and HybridAuto.P_intvs[1] (the original notes HybridA_Stage == 3).
+void Comput_Scheme_point_Jameson(cudaStream_t *stream){
+
+    const REAL thr_lo = HybridAuto.P_intvs[0];
+    const REAL thr_hi = HybridAuto.P_intvs[1];
+
+    dim3 size, griddim, blockdim;
+
+    // x direction
+    cudaJobPackage job(dim3(LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+    CUDA_LAUNCH(( Comput_Scheme_point_x_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(*pP_d, *HybridAuto.scheme_x, thr_lo, thr_hi, job) ));
+
+    // y direction
+    job.setup(dim3(LAP, LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+    CUDA_LAUNCH(( Comput_Scheme_point_y_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(*pP_d, *HybridAuto.scheme_y, thr_lo, thr_hi, job) ));
+
+    // z direction
+    job.setup(dim3(LAP, LAP, LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+    CUDA_LAUNCH(( Comput_Scheme_point_z_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(*pP_d, *HybridAuto.scheme_z, thr_lo, thr_hi, job) ));
+}
+
+
+// Rank-0 helper: opens "<prefix><Istep, 8 digits>.dat" for writing and emits the two header
+// lines used by the scheme dump. Aborts the MPI job with a message if the file cannot be
+// opened, instead of handing a NULL stream to fprintf.
+static FILE *HybridAuto_open_scheme_file_(const char *prefix){
+    char fp_name[120];
+    snprintf(fp_name, sizeof fp_name, "%s%08d.dat", prefix, Istep);
+
+    FILE *fp = fopen(fp_name, "w");
+    if(fp == NULL){
+        fprintf(stderr, "HybridAuto_scheme_IO: cannot open %s for writing\n", fp_name);
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    fprintf(fp, "variables=scheme\n");
+    fprintf(fp, "zone i=%d, j=%d\n", NX_GLOBAL, NY_GLOBAL);
+    return fp;
+}
+
+// Dumps the three scheme maps to text files Scheme_x / Scheme_y / Scheme_z<Istep>.dat.
+// The maps and the work field *pPP_d are first copied from device to host (scheme_x,
+// scheme_y, scheme_z, pP). Rank 0 opens each file and writes its header; write_2d_XY is then
+// called by every rank for the plane NZ_GLOBAL/2, and rank 0 closes the file afterwards.
+// On ranks other than 0 the FILE pointer passed to write_2d_XY is NULL (it used to be an
+// uninitialised value).
+void HybridAuto_scheme_IO(){
+    memcpy_All_int(scheme_x, HybridAuto.scheme_x->ptr, HybridAuto.scheme_x->pitch, D2H, nx+1, ny, nz);
+    memcpy_All_int(scheme_y, HybridAuto.scheme_y->ptr, HybridAuto.scheme_y->pitch, D2H, nx, ny+1, nz);
+    memcpy_All_int(scheme_z, HybridAuto.scheme_z->ptr, HybridAuto.scheme_z->pitch, D2H, nx, ny, nz+1);
+    memcpy_All(pP, pPP_d->ptr , pPP_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+
+    FILE *fp = NULL;
+
+    if(my_id == 0) fp = HybridAuto_open_scheme_file_("Scheme_x");
+    write_2d_XY(fp, NZ_GLOBAL/2, nx+1, ny, 0, scheme_x, pP);
+    if(my_id == 0) fclose(fp);
+
+    if(my_id == 0) fp = HybridAuto_open_scheme_file_("Scheme_y");
+    write_2d_XY(fp, NZ_GLOBAL/2, nx, ny+1, 0, scheme_y, pP);
+    if(my_id == 0) fclose(fp);
+
+    if(my_id == 0) fp = HybridAuto_open_scheme_file_("Scheme_z");
+    write_2d_XY(fp, NZ_GLOBAL/2, nx, ny, 0, scheme_z, pP);
+    if(my_id == 0) fclose(fp);
+}
+
+
+// Reports, on rank 0, how the three scheme types are distributed in each direction.
+// The scheme maps are copied from device to host, then for each direction:
+//   - local points are counted by type (1, 2, anything else counts as the third type),
+//   - the counts are divided by the global size of the two other directions,
+//   - MPI_Reduce sums them on rank 0,
+//   - rank 0 prints the sums divided by (N_GLOBAL + NP0) of the current direction.
+// NOTE(review): the printed numbers look like fractions of 1 although the message labels
+// them with a percent sign — confirm whether a factor of 100 is intended.
+// The literal percent signs in the format are written as "%%"; the previous single '%'
+// followed by an escape character was an invalid conversion specification.
+void HybridAuto_scheme_Proportion(){
+    memcpy_All_int(scheme_x, HybridAuto.scheme_x->ptr, HybridAuto.scheme_x->pitch, D2H, nx+1, ny, nz);
+    memcpy_All_int(scheme_y, HybridAuto.scheme_y->ptr, HybridAuto.scheme_y->pitch, D2H, nx, ny+1, nz);
+    memcpy_All_int(scheme_z, HybridAuto.scheme_z->ptr, HybridAuto.scheme_z->pitch, D2H, nx, ny, nz+1);
+
+    double type1 = 0.0, type2 = 0.0, type3 = 0.0;
+    // Only rank 0 receives the reduced values; initialise so other ranks hold defined data.
+    double Sum_type1 = 0.0, Sum_type2 = 0.0, Sum_type3 = 0.0;
+
+    const char scheme_percent[] = "The first type scheme of Hybrid schemes in direction %s is \033[34m%lf%%\033[0m, second is"
+ "\033[34m%lf%%\033[0m， third is \033[34m%lf%%\033[0m\n";
+
+    //------------------------------------------ X ------------------------------------------------
+    int tmp = (nx + 1) * ny * nz;
+    for(int i = 0; i < tmp; i++){
+        if(*(scheme_x + i) == 1){ 
+            type1 += 1.0;
+        }else if(*(scheme_x + i) == 2){
+            type2 += 1.0;
+        }else{
+            type3 += 1.0;
+        }
+    }
+
+    type1 /= NY_GLOBAL*NZ_GLOBAL;
+    type2 /= NY_GLOBAL*NZ_GLOBAL;
+    type3 /= NY_GLOBAL*NZ_GLOBAL;
+
+    MPI_Reduce(&type1, &Sum_type1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&type2, &Sum_type2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&type3, &Sum_type3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    tmp = NX_GLOBAL + NPX0;
+
+    if(my_id == 0) printf(scheme_percent, "X", Sum_type1/tmp, Sum_type2/tmp, Sum_type3/tmp);
+
+    //------------------------------------------ Y ------------------------------------------------
+    type1 = 0.0; type2 = 0.0; type3 = 0.0;
+
+    tmp = nx * (ny + 1) * nz;
+    for(int i = 0; i < tmp; i++){
+        if(*(scheme_y + i) == 1){ 
+            type1 += 1.0;
+        }else if(*(scheme_y + i) == 2){
+            type2 += 1.0;
+        }else{
+            type3 += 1.0;
+        }
+    }
+
+    type1 /= NX_GLOBAL*NZ_GLOBAL;
+    type2 /= NX_GLOBAL*NZ_GLOBAL;
+    type3 /= NX_GLOBAL*NZ_GLOBAL;
+
+    MPI_Reduce(&type1, &Sum_type1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&type2, &Sum_type2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&type3, &Sum_type3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    tmp = (NY_GLOBAL + NPY0);
+
+    if(my_id == 0) printf(scheme_percent, "Y", Sum_type1/tmp, Sum_type2/tmp, Sum_type3/tmp);
+
+    //------------------------------------------ Z ------------------------------------------------
+    type1 = 0.0; type2 = 0.0; type3 = 0.0;
+
+    tmp = nx * ny * (nz + 1);
+    for(int i = 0; i < tmp; i++){
+        if(*(scheme_z + i) == 1){ 
+            type1 += 1.0;
+        }else if(*(scheme_z + i) == 2){
+            type2 += 1.0;
+        }else{
+            type3 += 1.0;
+        }
+    }
+
+    type1 /= NX_GLOBAL*NY_GLOBAL;
+    type2 /= NX_GLOBAL*NY_GLOBAL;
+    type3 /= NX_GLOBAL*NY_GLOBAL;
+
+    MPI_Reduce(&type1, &Sum_type1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&type2, &Sum_type2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&type3, &Sum_type3, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+    tmp = (NZ_GLOBAL + NPZ0);
+
+    if(my_id == 0) printf(scheme_percent, "Z", Sum_type1/tmp, Sum_type2/tmp, Sum_type3/tmp);
+
+}
+
+
+// Returns the scheme type stored for the positive-side point of this thread.
+// coords is shifted by job.start, then by -LAP, and additionally by +1 along the sweep
+// direction before indexing scheme:
+//   flagxyz 1 or 4 : +1 in x, ny_d rows per z-plane
+//   flagxyz 2 or 5 : +1 in y, (ny_d + 1) rows per z-plane
+//   flagxyz 3 or 6 : +1 in z, ny_d rows per z-plane
+// Any other flagxyz yields 0.
+__device__ int get_Hyscheme_flag_p_kernel(int flagxyz, dim3 coords, cudaField_int scheme, cudaJobPackage job){
+    const unsigned int x = coords.x + job.start.x;
+    const unsigned int y = coords.y + job.start.y;
+    const unsigned int z = coords.z + job.start.z;
+
+    if(flagxyz == 1 || flagxyz == 4){
+        return *(scheme.ptr + (x + 1 - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
+    }
+
+    if(flagxyz == 2 || flagxyz == 5){
+        return *(scheme.ptr + (x - LAP + scheme.pitch *(y + 1 - LAP + (z - LAP)*(ny_d + 1))));
+    }
+
+    if(flagxyz == 3 || flagxyz == 6){
+        return *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z + 1 - LAP)*ny_d)));
+    }
+
+    return 0;
+}
+
+
+// Positive-side hybrid reconstruction for component i of f, accumulated into du.
+//  - get_data_kernel fills the 8-entry stencil (offsets ia1 = -3 .. ib1 = 4) and coords;
+//    a zero return means this thread does no further work.
+//  - The scheme type of the point is read from `scheme` by get_Hyscheme_flag_p_kernel.
+//  - OCFD_bound_scheme_kernel_p is called first; when it returns non-zero the value is
+//    computed by the selected scheme: 1 -> OMP6, 2 -> WENO7, otherwise NND2.
+//    (presumably the boundary routine supplies tmp_r itself when it returns 0 — confirm.)
+//  - tmp_l is obtained from the previous lane with __shfl_up_double, and every thread except
+//    threadIdx.x == 0 calls put_du_p_kernel with the pair (tmp_r, tmp_l).
+__global__ void OCFD_HybridAuto_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
+    extern __shared__ REAL sort[];
+    dim3 coords;
+    REAL stencil[8];
+    int ia1 = -3; int ib1 = 4;
+
+    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
+
+    if(flag != 0){
+        REAL tmp_r, tmp_l;
+
+        // The scheme type is fetched for every component. It used to be loaded only when
+        // i == 0, which left this local uninitialised for all other values of i.
+        const int Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);
+
+        flag = OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  
+
+        if(flag != 0){
+            if(Hyscheme_flag == 1){
+                tmp_r = OCFD_OMP6_kernel_P(0, &stencil[0]);
+            }else if(Hyscheme_flag == 2){
+                tmp_r = OCFD_weno7_kernel_P(&stencil[0]);
+            }else{
+                tmp_r = OCFD_NND2_kernel_P(&stencil[2]);
+            }
+        }
+
+        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+    }
+}
+
+
+// Positive-side hybrid reconstruction (Jameson variant) for component i of f, accumulated
+// into du.
+//  - get_data_kernel fills the 8-entry stencil (offsets ia1 = -3 .. ib1 = 4) and coords;
+//    a zero return means this thread does no further work.
+//  - The scheme type of the point is read from `scheme` by get_Hyscheme_flag_p_kernel.
+//  - OCFD_bound_scheme_kernel_p is called first; when it returns non-zero the value is
+//    computed by the selected scheme: 1 -> UP7, 2 -> WENO7, otherwise WENO5.
+//    (presumably the boundary routine supplies tmp_r itself when it returns 0 — confirm.)
+//  - tmp_l is obtained from the previous lane with __shfl_up_double, and every thread except
+//    threadIdx.x == 0 calls put_du_p_kernel with the pair (tmp_r, tmp_l).
+__global__ void OCFD_HybridAuto_P_Jameson_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
+    extern __shared__ REAL sort[];
+    dim3 coords;
+    REAL stencil[8];
+    int ia1 = -3; int ib1 = 4;
+
+    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
+
+    if(flag != 0){
+        REAL tmp_r, tmp_l;
+
+        // The scheme type is fetched for every component. It used to be loaded only when
+        // i == 0, which left this local uninitialised for all other values of i.
+        const int Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);
+
+        flag = OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  
+
+        if(flag != 0){
+            if(Hyscheme_flag == 1){
+                tmp_r = OCFD_UP7_kernel_P(&stencil[0]);
+            }else if(Hyscheme_flag == 2){
+                tmp_r = OCFD_weno7_kernel_P(&stencil[0]);
+            }else{
+                tmp_r = OCFD_weno5_kernel_P(&stencil[1]);
+            }
+        }
+
+        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+
+        if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+    }
+}
+
+
+// Returns the scheme type stored for the negative-side point of this thread.
+// coords is shifted by job.start and by -LAP in all three directions before indexing scheme:
+//   flagxyz 1 or 4 : ny_d rows per z-plane
+//   flagxyz 2 or 5 : (ny_d + 1) rows per z-plane
+//   flagxyz 3 or 6 : ny_d rows per z-plane
+// Any other flagxyz yields 0.
+__device__ int get_Hyscheme_flag_m_kernel(int flagxyz, dim3 coords, cudaField_int scheme, cudaJobPackage job){
+    const unsigned int x = coords.x + job.start.x;
+    const unsigned int y = coords.y + job.start.y;
+    const unsigned int z = coords.z + job.start.z;
+
+    if(flagxyz == 1 || flagxyz == 4){
+        return *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
+    }
+
+    if(flagxyz == 2 || flagxyz == 5){
+        return *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z - LAP)*(ny_d + 1))));
+    }
+
+    if(flagxyz == 3 || flagxyz == 6){
+        return *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
+    }
+
+    return 0;
+}
+
+
+// Negative-side hybrid reconstruction for component i of f, accumulated into du.
+//  - get_data_kernel fills the 8-entry stencil (offsets ia1 = -4 .. ib1 = 3) and coords;
+//    a zero return means this thread does no further work.
+//  - The scheme type of the point is read from `scheme` by get_Hyscheme_flag_m_kernel.
+//  - OCFD_bound_scheme_kernel_m is called first; when it returns non-zero the value is
+//    computed by the selected scheme: 1 -> OMP6, 2 -> WENO7, otherwise NND2.
+//    (presumably the boundary routine supplies tmp_r itself when it returns 0 — confirm.)
+//  - tmp_l is obtained from the previous lane with __shfl_up_double, and every thread except
+//    threadIdx.x == 0 calls put_du_m_kernel with the pair (tmp_r, tmp_l).
+__global__ void OCFD_HybridAuto_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
+    extern __shared__ REAL sort[];
+    dim3 coords;
+    REAL stencil[8];
+
+    int ia1 = -4; int ib1 = 3;
+
+    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
+    
+    if(flag != 0){
+        REAL tmp_r, tmp_l; 
+
+        // The scheme type is fetched for every component. It used to be loaded only when
+        // i == 0, which left this local uninitialised for all other values of i.
+        const int Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);
+    
+        flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  
+    
+        if(flag != 0){
+            if(Hyscheme_flag == 1){
+                tmp_r = OCFD_OMP6_kernel_M(0, &stencil[0]);
+            }else if(Hyscheme_flag == 2){
+                tmp_r = OCFD_weno7_kernel_M(&stencil[1]);
+            }else{
+                tmp_r = OCFD_NND2_kernel_M(&stencil[3]);
+            }
+        }
+    
+        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+    
+        if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+    }
+}
+
+
+// Negative-side hybrid reconstruction (Jameson variant) for component i of f, accumulated
+// into du.
+//  - get_data_kernel fills the 8-entry stencil (offsets ia1 = -4 .. ib1 = 3) and coords;
+//    a zero return means this thread does no further work.
+//  - The scheme type of the point is read from `scheme` by get_Hyscheme_flag_m_kernel.
+//  - OCFD_bound_scheme_kernel_m is called first; when it returns non-zero the value is
+//    computed by the selected scheme: 1 -> UP7, 2 -> WENO7, otherwise WENO5.
+//    (presumably the boundary routine supplies tmp_r itself when it returns 0 — confirm.)
+//  - tmp_l is obtained from the previous lane with __shfl_up_double, and every thread except
+//    threadIdx.x == 0 calls put_du_m_kernel with the pair (tmp_r, tmp_l).
+__global__ void OCFD_HybridAuto_M_Jameson_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
+    extern __shared__ REAL sort[];
+    dim3 coords;
+    REAL stencil[8];
+
+    int ia1 = -4; int ib1 = 3;
+
+    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
+    
+    if(flag != 0){
+        REAL tmp_r, tmp_l; 
+
+        // The scheme type is fetched for every component. It used to be loaded only when
+        // i == 0, which left this local uninitialised for all other values of i.
+        const int Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);
+    
+        flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);  
+    
+        if(flag != 0){
+            if(Hyscheme_flag == 1){
+                tmp_r = OCFD_UP7_kernel_M(&stencil[0]);
+            }else if(Hyscheme_flag == 2){
+                tmp_r = OCFD_weno7_kernel_M(&stencil[1]);
+            }else{
+                tmp_r = OCFD_weno5_kernel_M(&stencil[2]);
+            }
+        }
+    
+        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
+    
+        if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
+    }
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_Stream.cu
+++ b/src/OCFD_Stream.cu
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "OCFD_Stream.h"
+#include "OCFD_split.h"
+#include "OCFD_NS_Jacobian3d.h"
+#include "parameters.h"
+#include "OCFD_mpi_dev.h"
+#include "parameters_d.h"
+#include "commen_kernel.h"
+#include "OCFD_Schemes_hybrid_auto.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//static cudaStream_t Stream[15];
+
+/* Creates Stream[0..3] first, then Event[0..3]. */
+void opencfd_mem_init_Stream(){
+    int n = 0;
+    while(n < 4){
+        cudaStreamCreate(&Stream[n]);
+        ++n;
+    }
+
+    n = 0;
+    while(n < 4){
+        cudaEventCreate(&Event[n]);
+        ++n;
+    }
+}
+
+/* Destroys Stream[0..3] first, then Event[0..3]. */
+void opencfd_mem_finalize_Stream(){
+    int n = 0;
+    while(n < 4){
+        cudaStreamDestroy(Stream[n]);
+        ++n;
+    }
+
+    n = 0;
+    while(n < 4){
+        cudaEventDestroy(Event[n]);
+        ++n;
+    }
+}
+
+/*
+ * Evaluates du for the current step.
+ *
+ * When IFLAG_HybridAuto == 1 and KRK == 1 the hybrid scheme map is refreshed
+ * first. pdu_d is then reset to 0.0 over nx * ny * (nz*5) and the inviscid and
+ * viscous contributions are computed according to Stream_MODE:
+ *   0 : du_invis_Jacobian3d followed by du_vis_Jacobian3d
+ *   1 : du_Jacobian3d_all
+ *   otherwise rank 0 prints an error message and nothing is computed.
+ */
+void du_comput(int KRK){
+    if(IFLAG_HybridAuto == 1 && KRK == 1){
+        Set_Scheme_HybridAuto(&Stream[0]);
+    }
+
+    cuda_mem_value_init_warp(0.0 ,pdu_d->ptr, pdu_d->pitch, nx, ny, nz*5);
+
+    if(Stream_MODE == 0){
+        // Non-stream path.
+        du_invis_Jacobian3d(NULL);
+        du_vis_Jacobian3d(NULL);
+    }else if(Stream_MODE == 1){
+        // Stream path.
+        du_Jacobian3d_all(NULL);
+    }else if(my_id == 0){
+        printf("\033[31mWrong Stream Mode! Please choose 0 or 1, 0:non stream; 1:stream\033[0m\n");
+    }
+}
+
+/*
+
+void *du_Jacobian3d_all(void* pthread_id){
+    cudaJobPackage job(dim3(2*LAP, 2*LAP, 2*LAP), dim3(nx, ny, nz));
+//cudaJobPackage job(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap));
+
+	du_invis_Jacobian3d_init(job, &Stream[0]);//内区声速计算
+
+	job.setup(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
+	
+    Stager_Warming(job, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, &Stream[0]);
+    du_invis_Jacobian3d_x(job, pfp_x_d, pfm_x_d, &Stream[0]);//内区无粘项计算
+    du_invis_Jacobian3d_y(job, pfp_y_d, pfm_y_d, &Stream[0]);
+    du_invis_Jacobian3d_z(job, pfp_z_d, pfm_z_d, &Stream[0]);
+
+    du_invis_Jacobian3d_outer_exchange(&Stream[2]);//交换原始变量
+    cudaDeviceSynchronize();
+
+    du_invis_Jacobian3d_outer_x(&Stream[1]);//外区计算
+    du_invis_Jacobian3d_outer_y(&Stream[1]);
+    du_invis_Jacobian3d_outer_z(&Stream[1]);
+
+    cudaDeviceSynchronize();
+
+    du_viscous_Jacobian3d_init(&Stream[2]);//开始算粘性项全体导数
+    cudaDeviceSynchronize();
+
+    du_viscous_Jacobian3d_x_init(&Stream[2]);//粘性项计算
+    du_vis_Jacobian3d_inner_x(&Stream[2]);//内区开始计算
+    cudaDeviceSynchronize();
+
+    du_vis_Jacobian3d_outer_x(&Stream[3]);//外区x计算
+    cudaDeviceSynchronize();
+
+    du_viscous_Jacobian3d_y_init(&Stream[2]);
+    du_vis_Jacobian3d_inner_y(&Stream[2]);//内区开始计算
+    cudaDeviceSynchronize();
+
+    du_vis_Jacobian3d_outer_y(&Stream[3]);//外区x计算
+    cudaDeviceSynchronize();
+
+    du_viscous_Jacobian3d_z_init(&Stream[2]);
+    du_vis_Jacobian3d_inner_z(&Stream[2]);//内区开始计算
+    cudaDeviceSynchronize();
+
+    du_vis_Jacobian3d_outer_z(&Stream[3]);//外区x计算
+
+	return NULL;
+}
+*/
+
+
+/*
+ * Stream-based evaluation of the inviscid and viscous terms.
+ *
+ * Work is enqueued on Stream[0], Stream[1], Stream[2] and Stream[3]; ordering
+ * between streams is expressed only through Event[1], Event[2] and Event[3].
+ * This function body contains no explicit device synchronization call.
+ * pthread_id is not used. Always returns NULL.
+ */
+void *du_Jacobian3d_all(void* pthread_id){
+    cudaJobPackage job(dim3(2*LAP, 2*LAP, 2*LAP), dim3(nx, ny, nz));
+	du_invis_Jacobian3d_init(job, &Stream[0]);// inner region: sound-speed computation
+
+	job.setup(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
+	
+    Stager_Warming(job, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, &Stream[0]);
+    du_invis_Jacobian3d_x(job, pfp_x_d, pfm_x_d, &Stream[0]);// inner region: inviscid terms
+    du_invis_Jacobian3d_y(job, pfp_y_d, pfm_y_d, &Stream[0]);
+    du_invis_Jacobian3d_z(job, pfp_z_d, pfm_z_d, &Stream[0]);
+
+    du_invis_Jacobian3d_outer_exchange(&Stream[1]);// exchange the primitive variables
+
+    cudaEventRecord(Event[1], Stream[1]);// mark the point after the exchange on Stream[1]
+    cudaStreamWaitEvent(Stream[2], Event[1], 0);// Stream[2] (viscous work) waits for that point
+
+    du_invis_Jacobian3d_outer_x(&Stream[1]);// outer region: inviscid computation
+
+    du_viscous_Jacobian3d_init(&Stream[2]);// start computing the derivatives needed by the viscous terms
+    du_viscous_Jacobian3d_x_init(&Stream[2]);// viscous terms, x direction
+    cudaEventRecord(Event[2], Stream[2]);// mark the end of the x viscous init
+    du_vis_Jacobian3d_inner_x(&Stream[2]);// inner region: x computation
+
+    cudaStreamWaitEvent(Stream[3], Event[2], 0);// outer region waits for the x viscous init
+    du_vis_Jacobian3d_outer_x(&Stream[3]);// outer region: x computation
+    cudaEventRecord(Event[3], Stream[3]);
+
+    du_invis_Jacobian3d_outer_y(&Stream[1]);
+
+    cudaStreamWaitEvent(Stream[2], Event[3], 0);// Stream[2] waits for the outer x work on Stream[3]
+    du_viscous_Jacobian3d_y_init(&Stream[2]);
+    cudaEventRecord(Event[2], Stream[2]);// mark the end of the y viscous init
+    du_vis_Jacobian3d_inner_y(&Stream[2]);// inner region: y computation
+
+    cudaStreamWaitEvent(Stream[3], Event[2], 0);// outer region waits for the y viscous init
+    du_vis_Jacobian3d_outer_y(&Stream[3]);// outer region: y computation
+    cudaEventRecord(Event[3], Stream[3]);
+
+    du_invis_Jacobian3d_outer_z(&Stream[1]);
+
+    cudaStreamWaitEvent(Stream[2], Event[3], 0);// Stream[2] waits for the outer y work on Stream[3]
+    du_viscous_Jacobian3d_z_init(&Stream[2]);
+    cudaEventRecord(Event[2], Stream[2]);// mark the end of the z viscous init
+    du_vis_Jacobian3d_inner_z(&Stream[2]);// inner region: z computation
+
+    cudaStreamWaitEvent(Stream[3], Event[2], 0);// outer region waits for the z viscous init
+    du_vis_Jacobian3d_outer_z(&Stream[3]);// outer region: z computation
+
+	return NULL;
+}
+
+/*void* du_invis_Jacobian3d_all(void* pthread_id){
+
+	cudaJobPackage job(dim3(2*LAP, 2*LAP, 2*LAP), dim3(nx, ny, nz));
+	du_invis_Jacobian3d_init(job, &Stream[0]);
+
+	job.setup(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
+    //direction X ------------------------------
+	
+	du_invis_Jacobian3d_x(job, pfp_d, pfm_d, &Stream[0]);
+	du_invis_Jacobian3d_outer_x(&Stream[1]);
+
+    //direction Y ------------------------------
+
+	cudaEventRecord(Event[0], Stream[0]);
+	cudaEventRecord(Event[1], Stream[1]);
+	cudaStreamWaitEvent(Stream[0], Event[1], 0);
+	du_invis_Jacobian3d_y(job, pfp_d, pfm_d, &Stream[0]);
+	du_invis_Jacobian3d_outer_y(&Stream[1], &Event[0]);
+
+    //direction Z ------------------------------
+
+	cudaEventRecord(Event[0], Stream[0]);
+	cudaEventRecord(Event[1], Stream[1]);
+	cudaStreamWaitEvent(Stream[0], Event[1], 0);
+	du_invis_Jacobian3d_z(job, pfp_d, pfm_d, &Stream[0]);
+	du_invis_Jacobian3d_outer_z(&Stream[1], &Event[0]);
+	cudaEventRecord(Event[1], Stream[1]);
+
+	return NULL;
+}*/
+
+//void* du_vis_Jacobian3d_all(void* pthread_id){
+//
+//	cudaStreamWaitEvent(Stream[2], Event[1], 0);
+//	du_viscous_Jacobian3d_init(&Stream[2]);
+//
+//    //direction X ------------------------------
+//	
+//	du_viscous_Jacobian3d_x_init(&Stream[2]);
+//	cudaEventRecord(Event[2], Stream[2]);
+//	du_vis_Jacobian3d_inner_x(&Stream[2]);
+//	cudaStreamWaitEvent(Stream[1], Event[2], 0);
+//	du_vis_Jacobian3d_outer_x(&Stream[1]);
+//
+//    //direction Y ------------------------------
+//
+//	cudaEventRecord(Event[2], Stream[1]);
+//	cudaStreamWaitEvent(Stream[2], Event[2], 0);
+//	du_viscous_Jacobian3d_y_init(&Stream[2]);
+//	cudaEventRecord(Event[2], Stream[2]);
+//	du_vis_Jacobian3d_inner_y(&Stream[2]);
+//	cudaStreamWaitEvent(Stream[1], Event[2], 0);
+//	du_vis_Jacobian3d_outer_y(&Stream[1]);
+//
+//    //direction Z ------------------------------
+//	
+//	cudaEventRecord(Event[2], Stream[1]);
+//	cudaStreamWaitEvent(Stream[2], Event[2], 0);
+//	du_viscous_Jacobian3d_z_init(&Stream[2]);
+//	cudaEventRecord(Event[2], Stream[2]);
+//	du_vis_Jacobian3d_inner_z(&Stream[2]);
+//	cudaStreamWaitEvent(Stream[1], Event[2], 0);
+//	du_vis_Jacobian3d_outer_z(&Stream[1]);
+//	
+//
+//	return NULL;
+//}
+/*
+ * Enqueues the viscous-term computation for the three directions.
+ * Inner-region work goes to Stream[2], outer-region work to Stream[3];
+ * Event[2] is reused to order the two streams against each other.
+ * Event[1] is waited on but not recorded in this function.
+ * NOTE(review): presumably Event[1] is recorded by the inviscid path after the
+ * primitive-variable exchange; confirm against the caller.
+ * pthread_id is not used. Always returns NULL.
+ */
+void* du_vis_Jacobian3d_all(void* pthread_id){
+
+	du_viscous_Jacobian3d_init(&Stream[2]);
+
+    //direction X ------------------------------
+	
+	du_viscous_Jacobian3d_x_init(&Stream[2]);
+	// Mark the end of the x init so Stream[3] can start the outer region.
+	cudaEventRecord(Event[2], Stream[2]);
+	du_vis_Jacobian3d_inner_x(&Stream[2]);
+	cudaStreamWaitEvent(Stream[3], Event[1], 0);
+	cudaStreamWaitEvent(Stream[3], Event[2], 0);
+	du_vis_Jacobian3d_outer_x(&Stream[3]);
+
+    //direction Y ------------------------------
+
+	// Stream[2] must not start the y init before the outer x work on Stream[3].
+	cudaEventRecord(Event[2], Stream[3]);
+	cudaStreamWaitEvent(Stream[2], Event[2], 0);
+	du_viscous_Jacobian3d_y_init(&Stream[2]);
+	cudaEventRecord(Event[2], Stream[2]);
+	du_vis_Jacobian3d_inner_y(&Stream[2]);
+	cudaStreamWaitEvent(Stream[3], Event[2], 0);
+	du_vis_Jacobian3d_outer_y(&Stream[3]);
+
+    //direction Z ------------------------------
+	
+	// Same hand-over as above, now between the outer y work and the z init.
+	cudaEventRecord(Event[2], Stream[3]);
+	cudaStreamWaitEvent(Stream[2], Event[2], 0);
+	du_viscous_Jacobian3d_z_init(&Stream[2]);
+	cudaEventRecord(Event[2], Stream[2]);
+	du_vis_Jacobian3d_inner_z(&Stream[2]);
+	cudaStreamWaitEvent(Stream[3], Event[2], 0);
+	du_vis_Jacobian3d_outer_z(&Stream[3]);
+	
+
+	return NULL;
+}
+
+
+/* x-direction viscous final step on the inner block (3*LAP .. n-LAP in each direction). Returns NULL. */
+void* du_vis_Jacobian3d_inner_x(cudaStream_t *stream){
+    cudaJobPackage inner(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
+
+    du_viscous_Jacobian3d_x_final(inner, stream);
+    return NULL;
+}
+
+/* y-direction viscous final step on the inner block (3*LAP .. n-LAP in each direction). Returns NULL. */
+void* du_vis_Jacobian3d_inner_y(cudaStream_t *stream){
+    cudaJobPackage inner(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
+
+    du_viscous_Jacobian3d_y_final(inner, stream);
+    return NULL;
+}
+
+/* z-direction viscous final step on the inner block (3*LAP .. n-LAP in each direction). Returns NULL. */
+void* du_vis_Jacobian3d_inner_z(cudaStream_t *stream){
+    cudaJobPackage inner(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
+
+    du_viscous_Jacobian3d_z_final(inner, stream);
+    return NULL;
+}
+
+
+/*
+ * Non-stream inviscid path: exchanges the boundaries of d, u, v, w and T,
+ * then runs the init, Stager_Warming and the x/y/z inviscid steps over
+ * (LAP,LAP,LAP) .. (nx_lap,ny_lap,nz_lap) on Stream[0].
+ * pthread_id is not used. Always returns NULL.
+ */
+void* du_invis_Jacobian3d(void* pthread_id){
+    // Boundary exchange of the five fields, in the order d, u, v, w, T.
+    exchange_boundary_xyz_packed_dev(pd , pd_d);
+    exchange_boundary_xyz_packed_dev(pu , pu_d);
+    exchange_boundary_xyz_packed_dev(pv , pv_d);
+    exchange_boundary_xyz_packed_dev(pw , pw_d);
+    exchange_boundary_xyz_packed_dev(pT , pT_d);
+
+    cudaJobPackage domain(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap));
+
+    du_invis_Jacobian3d_init(domain, &Stream[0]);
+    Stager_Warming(domain, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, &Stream[0]);
+
+    du_invis_Jacobian3d_x(domain, pfp_x_d, pfm_x_d, &Stream[0]);
+    du_invis_Jacobian3d_y(domain, pfp_y_d, pfm_y_d, &Stream[0]);
+    du_invis_Jacobian3d_z(domain, pfp_z_d, pfm_z_d, &Stream[0]);
+
+    return NULL;
+}
+
+/*
+ * Non-stream viscous path on Stream[0]. For each direction: run the
+ * direction init, exchange pEv1..pEv4 along that direction, then run the
+ * direction final step over (LAP,LAP,LAP) .. (nx_lap,ny_lap,nz_lap).
+ * The y direction additionally calls boundary_symmetry_pole_vis_y before
+ * its final step. pthread_id is not used. Always returns NULL.
+ */
+void* du_vis_Jacobian3d(void* pthread_id){
+    cudaJobPackage domain(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap));
+    du_viscous_Jacobian3d_init(&Stream[0]);
+
+    // ---- x direction ----
+    du_viscous_Jacobian3d_x_init(&Stream[0]);
+    exchange_boundary_x_packed_dev(pEv1 , pEv1_d , Iperiodic[0]);
+    exchange_boundary_x_packed_dev(pEv2 , pEv2_d , Iperiodic[0]);
+    exchange_boundary_x_packed_dev(pEv3 , pEv3_d , Iperiodic[0]);
+    exchange_boundary_x_packed_dev(pEv4 , pEv4_d , Iperiodic[0]);
+    du_viscous_Jacobian3d_x_final(domain, &Stream[0]);
+
+    // ---- y direction ----
+    du_viscous_Jacobian3d_y_init(&Stream[0]);
+    exchange_boundary_y_packed_dev(pEv1 , pEv1_d , Iperiodic[1]);
+    exchange_boundary_y_packed_dev(pEv2 , pEv2_d , Iperiodic[1]);
+    exchange_boundary_y_packed_dev(pEv3 , pEv3_d , Iperiodic[1]);
+    exchange_boundary_y_packed_dev(pEv4 , pEv4_d , Iperiodic[1]);
+    boundary_symmetry_pole_vis_y(&Stream[0]);
+    du_viscous_Jacobian3d_y_final(domain, &Stream[0]);
+
+    // ---- z direction ----
+    du_viscous_Jacobian3d_z_init(&Stream[0]);
+    exchange_boundary_z_packed_dev(pEv1 , pEv1_d ,Iperiodic[2]);
+    exchange_boundary_z_packed_dev(pEv2 , pEv2_d ,Iperiodic[2]);
+    exchange_boundary_z_packed_dev(pEv3 , pEv3_d ,Iperiodic[2]);
+    exchange_boundary_z_packed_dev(pEv4 , pEv4_d ,Iperiodic[2]);
+    du_viscous_Jacobian3d_z_final(domain, &Stream[0]);
+
+    return NULL;
+}
+
+/* Inviscid init plus Stager_Warming on the two outer x slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_init_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_invis_Jacobian3d_init(slab, stream);
+    Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
+
+    // Second slab: same y/z extent, start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_invis_Jacobian3d_init(slab, stream);
+    Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
+
+    return NULL;
+}
+
+
+/* x-direction inviscid step on the two outer x slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_x_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
+
+    // Second slab: start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
+
+    return NULL;
+}
+
+/* y-direction inviscid step on the two outer x slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_y_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
+
+    // Second slab: start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
+
+    return NULL;
+}
+
+/* z-direction inviscid step on the two outer x slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_z_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
+
+    // Second slab: start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
+
+    return NULL;
+}
+
+/* Inviscid init plus Stager_Warming on the two outer y slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_init_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_invis_Jacobian3d_init(slab, stream);
+    Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
+
+    // Second slab: same x/z extent, start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_invis_Jacobian3d_init(slab, stream);
+    Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
+
+    return NULL;
+}
+	
+
+/* x-direction inviscid step on the two outer y slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_x_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
+
+    // Second slab: start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
+
+    return NULL;
+}
+
+/* y-direction inviscid step on the two outer y slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_y_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
+
+    // Second slab: start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
+
+    return NULL;
+}
+
+/* z-direction inviscid step on the two outer y slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_z_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
+
+    // Second slab: start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
+
+    return NULL;
+}
+
+/* Inviscid init plus Stager_Warming on the two outer z slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_init_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_invis_Jacobian3d_init(slab, stream);
+    Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
+
+    // Second slab: same x/y extent, start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_invis_Jacobian3d_init(slab, stream);
+    Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
+
+    return NULL;
+}
+
+/* x-direction inviscid step on the two outer z slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_x_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
+
+    // Second slab: start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
+
+    return NULL;
+}
+
+/* y-direction inviscid step on the two outer z slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_y_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
+
+    // Second slab: start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
+
+    return NULL;
+}
+
+/* z-direction inviscid step on the two outer z slabs. Returns NULL. */
+void* du_invis_Jacobian3d_outer_z_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
+
+    // Second slab: start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
+
+    return NULL;
+}
+
+/*
+ * Issues the asynchronous x/y/z boundary exchange of d, u, v, w and T
+ * (in that order) on `stream`. Returns NULL.
+ */
+void* du_invis_Jacobian3d_outer_exchange(cudaStream_t *stream){
+    exchange_boundary_xyz_Async_packed_dev(pd , pd_d , stream);
+    exchange_boundary_xyz_Async_packed_dev(pu , pu_d , stream);
+    exchange_boundary_xyz_Async_packed_dev(pv , pv_d , stream);
+    exchange_boundary_xyz_Async_packed_dev(pw , pw_d , stream);
+    exchange_boundary_xyz_Async_packed_dev(pT , pT_d , stream);
+
+    return NULL;
+}
+
+/*
+ * Outer x slabs, inviscid part: init, then the x-, y- and z-direction
+ * steps, all enqueued on `stream`. Returns NULL.
+ */
+void* du_invis_Jacobian3d_outer_x(cudaStream_t *stream){
+    du_invis_Jacobian3d_outer_init_x(stream);
+
+    du_invis_Jacobian3d_outer_x_x(stream);
+    du_invis_Jacobian3d_outer_y_x(stream);
+    du_invis_Jacobian3d_outer_z_x(stream);
+
+    return NULL;
+}
+
+
+/*
+ * Outer y slabs, inviscid part: init, then the x-, y- and z-direction
+ * steps, all enqueued on `stream`. Returns NULL.
+ */
+void* du_invis_Jacobian3d_outer_y(cudaStream_t *stream){
+    du_invis_Jacobian3d_outer_init_y(stream);
+
+    du_invis_Jacobian3d_outer_x_y(stream);
+    du_invis_Jacobian3d_outer_y_y(stream);
+    du_invis_Jacobian3d_outer_z_y(stream);
+
+    return NULL;
+}
+
+
+/*
+ * Outer z slabs, inviscid part: init, then the x-, y- and z-direction
+ * steps, all enqueued on `stream`. Returns NULL.
+ */
+void* du_invis_Jacobian3d_outer_z(cudaStream_t *stream){
+    du_invis_Jacobian3d_outer_init_z(stream);
+
+    du_invis_Jacobian3d_outer_x_z(stream);
+    du_invis_Jacobian3d_outer_y_z(stream);
+    du_invis_Jacobian3d_outer_z_z(stream);
+
+    return NULL;
+}
+
+
+/* x-direction viscous final step on the two outer x slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_x_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_viscous_Jacobian3d_x_final(slab, stream);
+
+    // Second slab: start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_viscous_Jacobian3d_x_final(slab, stream);
+
+    return NULL;
+}
+
+/* y-direction viscous final step on the two outer x slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_y_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_viscous_Jacobian3d_y_final(slab, stream);
+
+    // Second slab: start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_viscous_Jacobian3d_y_final(slab, stream);
+
+    return NULL;
+}
+
+/* z-direction viscous final step on the two outer x slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_z_x(cudaStream_t *stream){
+    // First slab: start.x = LAP, end.x = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));
+    du_viscous_Jacobian3d_z_final(slab, stream);
+
+    // Second slab: start.x = nx-LAP, end.x = nx_lap.
+    slab.start.x = nx-LAP;
+    slab.end.x = nx_lap;
+    du_viscous_Jacobian3d_z_final(slab, stream);
+
+    return NULL;
+}
+
+/* x-direction viscous final step on the two outer y slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_x_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_viscous_Jacobian3d_x_final(slab, stream);
+
+    // Second slab: start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_viscous_Jacobian3d_x_final(slab, stream);
+
+    return NULL;
+}
+
+/* y-direction viscous final step on the two outer y slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_y_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_viscous_Jacobian3d_y_final(slab, stream);
+
+    // Second slab: start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_viscous_Jacobian3d_y_final(slab, stream);
+
+    return NULL;
+}
+
+/* z-direction viscous final step on the two outer y slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_z_y(cudaStream_t *stream){
+    // First slab: start.y = LAP, end.y = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));
+    du_viscous_Jacobian3d_z_final(slab, stream);
+
+    // Second slab: start.y = ny-LAP, end.y = ny_lap.
+    slab.start.y = ny-LAP;
+    slab.end.y = ny_lap;
+    du_viscous_Jacobian3d_z_final(slab, stream);
+
+    return NULL;
+}
+
+/* x-direction viscous final step on the two outer z slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_x_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_viscous_Jacobian3d_x_final(slab, stream);
+
+    // Second slab: start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_viscous_Jacobian3d_x_final(slab, stream);
+
+    return NULL;
+}
+
+/* y-direction viscous final step on the two outer z slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_y_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_viscous_Jacobian3d_y_final(slab, stream);
+
+    // Second slab: start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_viscous_Jacobian3d_y_final(slab, stream);
+
+    return NULL;
+}
+
+/* z-direction viscous final step on the two outer z slabs. Returns NULL. */
+void* du_vis_Jacobian3d_outer_z_z(cudaStream_t *stream){
+    // First slab: start.z = LAP, end.z = 3*LAP.
+    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));
+    du_viscous_Jacobian3d_z_final(slab, stream);
+
+    // Second slab: start.z = nz-LAP, end.z = nz_lap.
+    slab.start.z = nz-LAP;
+    slab.end.z = nz_lap;
+    du_viscous_Jacobian3d_z_final(slab, stream);
+
+    return NULL;
+}
+
+/*
+ * x-direction viscous work for the outer region: asynchronous x exchange of
+ * pEv1..pEv4 on `stream`, then the x-direction final step on the outer
+ * x, y and z slabs. Returns NULL.
+ */
+void* du_vis_Jacobian3d_outer_x(cudaStream_t *stream){
+    exchange_boundary_x_Async_packed_dev(pEv1 , pEv1_d , Iperiodic[0], stream);
+    exchange_boundary_x_Async_packed_dev(pEv2 , pEv2_d , Iperiodic[0], stream);
+    exchange_boundary_x_Async_packed_dev(pEv3 , pEv3_d , Iperiodic[0], stream);
+    exchange_boundary_x_Async_packed_dev(pEv4 , pEv4_d , Iperiodic[0], stream);
+
+    du_vis_Jacobian3d_outer_x_x(stream);
+    du_vis_Jacobian3d_outer_x_y(stream);
+    du_vis_Jacobian3d_outer_x_z(stream);
+
+    return NULL;
+}
+
+/*
+ * y-direction viscous work for the outer region: asynchronous y exchange of
+ * pEv1..pEv4 on `stream`, boundary_symmetry_pole_vis_y, then the y-direction
+ * final step on the outer x, y and z slabs. Returns NULL.
+ */
+void* du_vis_Jacobian3d_outer_y(cudaStream_t *stream){
+    exchange_boundary_y_Async_packed_dev(pEv1 , pEv1_d , Iperiodic[1], stream);
+    exchange_boundary_y_Async_packed_dev(pEv2 , pEv2_d , Iperiodic[1], stream);
+    exchange_boundary_y_Async_packed_dev(pEv3 , pEv3_d , Iperiodic[1], stream);
+    exchange_boundary_y_Async_packed_dev(pEv4 , pEv4_d , Iperiodic[1], stream);
+
+    boundary_symmetry_pole_vis_y(stream);
+
+    du_vis_Jacobian3d_outer_y_x(stream);
+    du_vis_Jacobian3d_outer_y_y(stream);
+    du_vis_Jacobian3d_outer_y_z(stream);
+
+    return NULL;
+}
+
+/*
+ * z-direction viscous work for the outer region: asynchronous z exchange of
+ * pEv1..pEv4 on `stream`, then the z-direction final step on the outer
+ * x, y and z slabs. Returns NULL.
+ */
+void* du_vis_Jacobian3d_outer_z(cudaStream_t *stream){
+    exchange_boundary_z_Async_packed_dev(pEv1 , pEv1_d , Iperiodic[2], stream);
+    exchange_boundary_z_Async_packed_dev(pEv2 , pEv2_d , Iperiodic[2], stream);
+    exchange_boundary_z_Async_packed_dev(pEv3 , pEv3_d , Iperiodic[2], stream);
+    exchange_boundary_z_Async_packed_dev(pEv4 , pEv4_d , Iperiodic[2], stream);
+
+    du_vis_Jacobian3d_outer_z_x(stream);
+    du_vis_Jacobian3d_outer_z_y(stream);
+    du_vis_Jacobian3d_outer_z_z(stream);
+
+    return NULL;
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_ana.cu
+++ b/src/OCFD_ana.cu
+#include "OCFD_ana.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * For every (x, y, z) in [job.start, job.end) copies the value read from x2
+ * through get_Field_LAP into x1 at (x-LAP, y-LAP, z-LAP) through get_Field.
+ * One thread handles one point; threads outside the range do nothing.
+ */
+__global__ void get_inner_kernel(cudaField x1, cudaField x2, cudaJobPackage job){
+    const unsigned int gx = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+    const unsigned int gy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+    const unsigned int gz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+    if(gx >= job.end.x || gy >= job.end.y || gz >= job.end.z) return;
+
+    get_Field(x1, gx-LAP, gy-LAP, gz-LAP) = get_Field_LAP(x2, gx, gy, gz);
+}
+
+/*
+ * Launches get_inner_kernel over (LAP,LAP,LAP) .. (nx_lap,ny_lap,nz_lap) with
+ * a grid sized for nx * ny * nz points, copying x2 into x1.
+ */
+void get_inner(cudaField x1, cudaField x2){
+    dim3 grid , block;
+    cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );
+
+    cudaJobPackage interior(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));
+    get_inner_kernel<<<grid , block>>>(x1, x2, interior);
+}
+
+/*
+ * Copies the Jacobian field pAjac_d back to the host buffer pAjac and scans
+ * all nx_2lap * ny_2lap * nz_2lap entries for negative values. Every negative
+ * entry is reported with its offset-adjusted index; if at least one was
+ * found the process exits.
+ * NOTE(review): the exit status is 0 even though this is an error path, and
+ * the process leaves without going through MPI; confirm this is intended.
+ */
+void ana_Jac(){
+    int found_negative = 0;
+
+    memcpy_All(pAjac , pAjac_d->ptr , pAjac_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+
+    for(int kk = 0; kk < nz_2lap; kk++){
+        for(int jj = 0; jj < ny_2lap; jj++){
+            for(int ii = 0; ii < nx_2lap; ii++){
+                // Linear index with x fastest, then y, then z.
+                const unsigned long int idx = ii + nx_2lap*(jj + kk*ny_2lap);
+                if( pAjac[idx] < 0 ){
+                    printf("\033[31mNegative Jac occured in %d , %d , %d\033[0m\n",
+                     i_offset[npx]+ii,j_offset[npy]+jj,k_offset[npz]+kk);
+                    found_negative = 1;
+                }
+            }
+        }
+    }
+
+    if(found_negative == 1) exit(0);
+}
+
+/*
+ * Per-block sum of pE.
+ *
+ * Each thread reads one value of pE (0 when its index is outside job.end),
+ * the values are summed inside each warp with warpReduce, lane 0 of every warp
+ * stores its partial sum in dynamic shared memory, and after a block barrier
+ * the first warp sums the first SMEMDIM partial sums. Thread 0 writes the
+ * block total to g_odata at the linearized block index.
+ * Note that job.start is not applied to the thread index here.
+ */
+__global__ void add_E_kernel(cudaField pE, int SMEMDIM, REAL *g_odata, cudaJobPackage job){
+    extern __shared__ REAL shared[];
+
+    const unsigned int gx = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int gy = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int gz = blockDim.z * blockIdx.z + threadIdx.z;
+
+    // Linear thread index inside the block and its warp / lane decomposition.
+    const unsigned int tid  = blockDim.x * blockDim.y * threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x;
+    const unsigned int warp = tid / warpSize;
+    const unsigned int lane = tid % warpSize;
+
+    REAL acc = 0.;
+    if(gx < job.end.x && gy < job.end.y && gz < job.end.z){
+        acc = get_Field(pE, gx, gy, gz);
+    }
+
+    acc = warpReduce(acc);
+
+    if(lane == 0) shared[warp] = acc;
+    __syncthreads();
+
+    if(tid < SMEMDIM){
+        acc = shared[tid];
+    }else{
+        acc = 0;
+    }
+
+    if(warp == 0) acc = warpReduce(acc);
+    if(tid == 0) g_odata[blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z] = acc;
+}
+
+/*
+ * Computes the global mean of PE_d and stores it in *E0.
+ *
+ * Steps: add_E_kernel writes one partial sum per block over (0,0,0) ..
+ * (nx,ny,nz); add_kernel is launched repeatedly until its grid has shrunk to a
+ * single block; the first element of the device buffer is copied back, summed
+ * over all ranks with MPI_Allreduce, and divided by
+ * NX_GLOBAL * NY_GLOBAL * NZ_GLOBAL.
+ *
+ * PE_d : device field to be averaged
+ * E0   : output, receives the global mean (same value on every rank)
+ */
+void ana_residual(cudaField PE_d, REAL *E0){
+
+    dim3 size, griddim, blockdim;
+    cudaJobPackage job(dim3(0, 0, 0), dim3(nx, ny, nz));
+    jobsize(&job, &size);
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);
+
+    REAL *g_odata;
+    // Plain local instead of a heap allocation that was never freed.
+    REAL Sum = 0;
+
+    unsigned int g_odata_size = griddim.x*griddim.y*griddim.z;
+    CUDA_LAUNCH(( cudaMalloc((REAL **)&g_odata, g_odata_size*sizeof(REAL)) ));
+
+    // Number of 64-wide warps per block, rounded up so that a trailing partial
+    // warp (or a block smaller than one warp) still gets a slot.
+    const unsigned int threads_per_block = blockdim.x*blockdim.y*blockdim.z;
+    int SMEMDIM = (threads_per_block + 63)/64;   //Warpsize is 64
+    // The third launch argument is a size in bytes: one REAL per warp.
+    CUDA_LAUNCH((add_E_kernel<<<griddim, blockdim, SMEMDIM*sizeof(REAL)>>>(PE_d, SMEMDIM, g_odata, job)));
+
+    dim3 blockdim_sum(512);
+    dim3 griddim_sum(g_odata_size); 
+
+    // NOTE(review): add_kernel is not visible here. 8 equals the number of
+    // 64-wide warps in a 512-thread block, so the shared buffer is sized as
+    // 8 REALs instead of 8 bytes; over-allocating is harmless if it needs less.
+    const size_t sum_smem_bytes = (blockdim_sum.x/64)*sizeof(REAL);
+
+    do{
+        griddim_sum.x = (griddim_sum.x + blockdim_sum.x - 1)/blockdim_sum.x;
+        CUDA_LAUNCH(( add_kernel<<<griddim_sum, blockdim_sum, sum_smem_bytes>>>(g_odata, g_odata_size) ));
+    } while(griddim_sum.x > 1);
+
+    CUDA_LAUNCH(( cudaMemcpy(&Sum, g_odata, sizeof(REAL), cudaMemcpyDeviceToHost) ));
+    CUDA_LAUNCH(( cudaFree(g_odata) ));
+
+    MPI_Allreduce(&Sum, E0, 1, OCFD_DATA_TYPE, MPI_SUM, MPI_COMM_WORLD);
+
+    // Form the point count in floating point: the integer product can exceed
+    // the range of int on large grids.
+    *E0 /= (REAL)NX_GLOBAL * (REAL)NY_GLOBAL * (REAL)NZ_GLOBAL;
+}
+
+/*
+ * Sanity check of the temperature field.
+ *
+ * Copies pT_d back to the host buffer pT and scans all
+ * nx_2lap * ny_2lap * nz_2lap entries:
+ *   - the first NaN found is reported and the NaN scan stops;
+ *   - every negative value is reported and counted.
+ * The run is aborted with MPI_Abort when more than n_NT_limit (10) negative
+ * values were counted or when a NaN was found.
+ *
+ * Only T is inspected. The equivalent scans of d, u, v and w (and their
+ * device-to-host copies) are disabled; they would follow the same pattern as
+ * the T scan below.
+ */
+void ana_NAN_and_NT(){
+    int i,j,k;
+    unsigned long int offset;
+
+    unsigned int n_NT_limit = 10;
+    char has_nan = 0;
+    unsigned long int n_NT = 0;
+
+    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+
+    if(my_id == 0) printf("It is analyzing NAN......\n");
+
+    // NaN scan: stop at the first hit, one report per rank is enough.
+    for(k=0;k<nz_2lap;k++){
+        for(j=0;j<ny_2lap;j++){
+            for(i=0;i<nx_2lap;i++){
+                offset = i + nx_2lap*(j + k*ny_2lap);
+                if( isnan( *(pT + offset) ) ){
+                    has_nan = 1;
+                    printf("\033[31mNAN occured in Global ID(%d , %d , %d)\033[0m\n\n",i_offset[npx]+i-LAP,j_offset[npy]+j-LAP,k_offset[npz]+k-LAP);
+                    goto end_T;
+                }
+            }
+        }
+    }
+    end_T:;
+
+    // Negative-temperature scan: report and count every hit.
+    for(k=0;k<nz_2lap;k++){
+        for(j=0;j<ny_2lap;j++){
+            for(i=0;i<nx_2lap;i++){
+                offset = i + nx_2lap*(j + k*ny_2lap);
+                if( *(pT + offset) < 0 ){
+                    n_NT++;
+                    printf("\033[31mNegative T occured in Global ID(%d , %d , %d)\033[0m\n\n",i_offset[npx]+i-LAP,j_offset[npy]+j-LAP,k_offset[npz]+k-LAP);
+                }
+            }
+        }
+    }
+    if(n_NT > n_NT_limit){
+        // %lu / %u match the unsigned long / unsigned int arguments.
+        printf("\033[31mNegative T points %lu > %u\033[0m on Proc(%d , %d , %d)\033[0m\n",n_NT , n_NT_limit,npx,npy,npz);
+        MPI_Abort(MPI_COMM_WORLD , 1);
+    }
+    if( has_nan ){
+        if(my_id == 0) printf("\033[31mNAN occured , program Abort\033[0m\n");
+        MPI_Abort(MPI_COMM_WORLD , 1);
+    }
+}
+
+// Zero the five running-average fields (d1, u1, v1, w1, T1) at every point of
+// the index box [job.start, job.end). One thread handles one (x, y, z) point;
+// the fields are addressed with the halo-inclusive accessor get_Field_LAP.
+__global__ void init_time_average_kernel(cudaField d1, cudaField u1, cudaField v1, cudaField w1, cudaField T1, cudaJobPackage job){
+    const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    // Threads that fall outside the job box have nothing to do.
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    get_Field_LAP(d1, x, y, z) = 0.;
+    get_Field_LAP(u1, x, y, z) = 0.;
+    get_Field_LAP(v1, x, y, z) = 0.;
+    get_Field_LAP(w1, x, y, z) = 0.;
+    get_Field_LAP(T1, x, y, z) = 0.;
+}
+
+// Update the running averages in place:
+//     avg_new = (Istep * avg_old + sample) / (Istep + 1)
+// for density, the three velocity components and temperature.
+//   d1,u1,v1,w1,T1 : running averages (read and written)
+//   d,u,v,w,T      : current samples (read only)
+//   Istep          : number of samples already folded into the averages
+//   job            : index box [job.start, job.end) to process
+__global__ void ana_time_average_kernel(cudaField d1, cudaField u1, cudaField v1, cudaField w1, cudaField T1, 
+    cudaField d, cudaField u, cudaField v, cudaField w, cudaField T, int Istep, cudaJobPackage job){
+    const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    // Shared divisor of all five updates.
+    const double n_samples = Istep + 1.;
+
+    get_Field_LAP(d1, x, y, z) = (Istep * get_Field_LAP(d1, x, y, z) + get_Field_LAP(d, x, y, z))/n_samples;
+    get_Field_LAP(u1, x, y, z) = (Istep * get_Field_LAP(u1, x, y, z) + get_Field_LAP(u, x, y, z))/n_samples;
+    get_Field_LAP(v1, x, y, z) = (Istep * get_Field_LAP(v1, x, y, z) + get_Field_LAP(v, x, y, z))/n_samples;
+    get_Field_LAP(w1, x, y, z) = (Istep * get_Field_LAP(w1, x, y, z) + get_Field_LAP(w, x, y, z))/n_samples;
+    get_Field_LAP(T1, x, y, z) = (Istep * get_Field_LAP(T1, x, y, z) + get_Field_LAP(T, x, y, z))/n_samples;
+}
+
+/*
+ * ana_time_average: fold the current device fields (pd_d, pu_d, pv_d, pw_d,
+ * pT_d) into the running averages (pdm_d ... pTm_d).
+ *
+ *  - When average_IO == 1 the host and device average buffers are allocated
+ *    and read_file() is called with the host buffers.
+ *  - The averaging kernel runs over the box [LAP, n?_lap) in each direction.
+ *  - Istep_average and tt_average are advanced by one step / dt.
+ *  - Every Kstep_save steps the averages are copied to the host and saved.
+ *  - When tt equals end_time the buffers are released.
+ *
+ * NOTE(review): `tt == end_time` is an exact floating-point comparison; if tt
+ * is accumulated from dt it may never compare equal - confirm intent.
+ * NOTE(review): the byte count is held in an int (malloc_me takes an int), so
+ * very large local blocks could overflow it.
+ */
+void ana_time_average(){
+    if(my_id == 0) printf("It is averaging......\n");
+
+    // Halo-inclusive extents of one field.
+    const int ext_x = nx + 2 * LAP;
+    const int ext_y = ny + 2 * LAP;
+    const int ext_z = nz + 2 * LAP;
+
+    if(average_IO == 1){
+        int tmp_size = ext_x * ext_y * ext_z * sizeof(REAL);
+
+        // host-side averages
+        pdm = (REAL *)malloc_me(tmp_size);
+        pum = (REAL *)malloc_me(tmp_size);
+        pvm = (REAL *)malloc_me(tmp_size);
+        pwm = (REAL *)malloc_me(tmp_size);
+        pTm = (REAL *)malloc_me(tmp_size);
+
+        // device-side averages
+        new_cudaField(&pdm_d , ext_x , ext_y , ext_z);
+        new_cudaField(&pum_d , ext_x , ext_y , ext_z);
+        new_cudaField(&pvm_d , ext_x , ext_y , ext_z);
+        new_cudaField(&pwm_d , ext_x , ext_y , ext_z);
+        new_cudaField(&pTm_d , ext_x , ext_y , ext_z);
+
+        read_file(average_IO, pdm, pum, pvm, pwm, pTm);
+    }
+
+    dim3 griddim , blockdim;
+    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );
+
+    cudaJobPackage job(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));
+
+    CUDA_LAUNCH(( ana_time_average_kernel<<<griddim , blockdim>>>(*pdm_d, *pum_d, *pvm_d, *pwm_d, *pTm_d, 
+                                               *pd_d, *pu_d, *pv_d, *pw_d, *pT_d, Istep_average, job) ));
+
+    // One more sample has been accumulated.
+    Istep_average += 1;
+    tt_average += dt;
+
+    const int save_now = (Istep%Kstep_save == 0);
+    if(save_now){
+        memcpy_All(pdm , pdm_d->ptr , pdm_d->pitch , D2H , ext_x , ext_y , ext_z);
+        memcpy_All(pum , pum_d->ptr , pum_d->pitch , D2H , ext_x , ext_y , ext_z);
+        memcpy_All(pvm , pvm_d->ptr , pvm_d->pitch , D2H , ext_x , ext_y , ext_z);
+        memcpy_All(pwm , pwm_d->ptr , pwm_d->pitch , D2H , ext_x , ext_y , ext_z);
+        memcpy_All(pTm , pTm_d->ptr , pTm_d->pitch , D2H , ext_x , ext_y , ext_z);
+        OCFD_save(1, Istep_average, pdm, pum, pvm, pwm, pTm);
+    }
+
+    const int finished = (tt == end_time);
+    if(finished){
+        free(pdm);
+        free(pum);
+        free(pvm);
+        free(pwm);
+        free(pTm);
+
+        delete_cudaField(pdm_d);
+        delete_cudaField(pum_d);
+        delete_cudaField(pvm_d);
+        delete_cudaField(pwm_d);
+        delete_cudaField(pTm_d);
+    }
+}
+
+
+/*
+ * init_time_average: zero the five running-average device fields
+ * (pdm_d, pum_d, pvm_d, pwm_d, pTm_d) over the whole halo-inclusive box
+ * [0, n?_2lap) in each direction.
+ */
+void init_time_average(){
+    // The job spans the full halo-inclusive box, so the launch must provide
+    // nx_2lap x ny_2lap x nz_2lap threads. Sizing the grid with nx/ny/nz
+    // (assuming cal_grid_block_dim covers exactly the extents it is given, as
+    // its other call sites in this file suggest) left the last 2*LAP planes in
+    // every direction unzeroed - including interior points that
+    // ana_time_average_kernel later reads as "previous average".
+    dim3 griddim , blockdim;
+    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx_2lap , ny_2lap , nz_2lap );
+
+    cudaJobPackage job(dim3(0,0,0) , dim3(nx_2lap,ny_2lap,nz_2lap));
+
+    CUDA_LAUNCH(( init_time_average_kernel<<<griddim , blockdim>>>(*pdm_d, *pum_d, *pvm_d, *pwm_d, *pTm_d, job) ));
+}
+
+// Evaluate Q at each point (x, y, z) with x < job.end.x, y < job.end.y,
+// z < job.end.z (job.start is not used; indices start at 0).
+//
+//   ui,us,uk / vi,vs,vk / wi,ws,wk : derivative fields of u, v, w, read with
+//                                    get_Field at (x, y, z)
+//   Ak?, Ai?, As?, Ajac            : metric fields, read with get_Field_LAP at
+//                                    (x+LAP, y+LAP, z+LAP)
+//   Q                              : output, written with get_Field_LAP at
+//                                    (x+LAP, y+LAP, z+LAP)
+//
+// Each physical derivative is formed as  k*Ak + i*Ai + s*As  and
+//   Q = (ux*vy + ux*wz + vy*wz - uy*vx - uz*wx - vz*wy) * Ajac * Ajac.
+__global__ void get_Q_kernal(
+    cudaField ui,
+    cudaField us,
+    cudaField uk,
+    cudaField vi,
+    cudaField vs,
+    cudaField vk,
+    cudaField wi,
+    cudaField ws,
+    cudaField wk,
+    cudaField Akx,
+    cudaField Aky,
+    cudaField Akz,
+    cudaField Aix,
+    cudaField Aiy,
+    cudaField Aiz,
+    cudaField Asx,
+    cudaField Asy,
+    cudaField Asz,
+    cudaField Ajac,
+    cudaField Q,
+    cudaJobPackage job){
+
+    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+
+    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;
+
+    // Metric terms at this point (halo-inclusive addressing).
+    const REAL kx = get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP);
+    const REAL ky = get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP);
+    const REAL kz = get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP);
+    const REAL ix = get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP);
+    const REAL iy = get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP);
+    const REAL iz = get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP);
+    const REAL sx = get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
+    const REAL sy = get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
+    const REAL sz = get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);
+    const REAL jac = get_Field_LAP(Ajac, x+LAP, y+LAP, z+LAP);
+
+    // Derivative fields at this point (no halo).
+    const REAL u_k = get_Field(uk, x, y, z);
+    const REAL u_i = get_Field(ui, x, y, z);
+    const REAL u_s = get_Field(us, x, y, z);
+    const REAL v_k = get_Field(vk, x, y, z);
+    const REAL v_i = get_Field(vi, x, y, z);
+    const REAL v_s = get_Field(vs, x, y, z);
+    const REAL w_k = get_Field(wk, x, y, z);
+    const REAL w_i = get_Field(wi, x, y, z);
+    const REAL w_s = get_Field(ws, x, y, z);
+
+    const REAL ux = u_k*kx + u_i*ix + u_s*sx;
+    const REAL vx = v_k*kx + v_i*ix + v_s*sx;
+    const REAL wx = w_k*kx + w_i*ix + w_s*sx;
+
+    const REAL uy = u_k*ky + u_i*iy + u_s*sy;
+    const REAL vy = v_k*ky + v_i*iy + v_s*sy;
+    const REAL wy = w_k*ky + w_i*iy + w_s*sy;
+
+    const REAL uz = u_k*kz + u_i*iz + u_s*sz;
+    const REAL vz = v_k*kz + v_i*iz + v_s*sz;
+    const REAL wz = w_k*kz + w_i*iz + w_s*sz;
+
+    get_Field_LAP(Q, x+LAP, y+LAP, z+LAP) = (ux*vy + ux*wz + vy*wz - uy*vx - uz*wx - vz*wy)*
+                            jac*
+                            jac;
+}
+
+/*
+ * get_Q: compute the Q field from the current velocity fields, write it out
+ * and terminate the program.
+ *
+ * Steps:
+ *   1. Differentiate pu_d, pv_d, pw_d with OCFD_dx0 / OCFD_dy0 / OCFD_dz0 into
+ *      nine temporary fields.
+ *   2. Launch get_Q_kernal to combine them with the metric fields into Q_d.
+ *   3. Copy Q_d to the host buffer pP and write it to "Q.dat" via write_3d1.
+ *   4. Write a per-rank text file "Q<npx><npy><npz>.dat" with x, y, z, Q for
+ *      every interior point (header lines `variables=` / `zone` - looks like
+ *      Tecplot ASCII, confirm).
+ *   5. Free the temporaries, finalize MPI and exit(0).
+ *
+ * This function does not return.
+ */
+void get_Q(){
+    // Derivative temporaries (no halo).
+    cudaField *ui; new_cudaField(&ui, nx, ny, nz);
+    cudaField *us; new_cudaField(&us, nx, ny, nz);
+    cudaField *uk; new_cudaField(&uk, nx, ny, nz);
+    cudaField *vi; new_cudaField(&vi, nx, ny, nz);
+    cudaField *vs; new_cudaField(&vs, nx, ny, nz);
+    cudaField *vk; new_cudaField(&vk, nx, ny, nz);
+    cudaField *wi; new_cudaField(&wi, nx, ny, nz);
+    cudaField *ws; new_cudaField(&ws, nx, ny, nz);
+    cudaField *wk; new_cudaField(&wk, nx, ny, nz);
+
+    // Q is written by the kernel through get_Field_LAP at (x+LAP, y+LAP, z+LAP)
+    // and copied back below with halo-inclusive extents, so it must be
+    // allocated with the halo included. Allocating it as nx*ny*nz made both the
+    // kernel write and the device-to-host copy run past the end of the buffer.
+    cudaField *Q_d; new_cudaField(&Q_d, nx_2lap, ny_2lap, nz_2lap);
+
+    cudaJobPackage job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap, nz_lap) );
+
+    OCFD_dx0(*pu_d, *uk, job, BlockDim_X, &Stream[0], D0_bound[0], D0_bound[1]);
+    OCFD_dx0(*pv_d, *vk, job, BlockDim_X, &Stream[0], D0_bound[0], D0_bound[1]);
+    OCFD_dx0(*pw_d, *wk, job, BlockDim_X, &Stream[0], D0_bound[0], D0_bound[1]);
+    OCFD_dy0(*pu_d, *ui, job, BlockDim_Y, &Stream[0], D0_bound[2], D0_bound[3]);
+    OCFD_dy0(*pv_d, *vi, job, BlockDim_Y, &Stream[0], D0_bound[2], D0_bound[3]);
+    OCFD_dy0(*pw_d, *wi, job, BlockDim_Y, &Stream[0], D0_bound[2], D0_bound[3]);
+    OCFD_dz0(*pu_d, *us, job, BlockDim_Z, &Stream[0], D0_bound[4], D0_bound[5]);
+    OCFD_dz0(*pv_d, *vs, job, BlockDim_Z, &Stream[0], D0_bound[4], D0_bound[5]);
+    OCFD_dz0(*pw_d, *ws, job, BlockDim_Z, &Stream[0], D0_bound[4], D0_bound[5]);
+
+    dim3 griddim , blockdim;
+    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, ny, nz);
+    job.setup( dim3(0,0,0) , dim3(nx,ny,nz) );
+
+    CUDA_LAUNCH(( get_Q_kernal<<<griddim, blockdim>>>(*ui,*us,*uk,*vi,*vs,*vk,*wi,*ws,*wk,*pAkx_d,
+        *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,*Q_d,job) ));
+
+
+    // pP is reused as the host staging buffer for Q.
+    memcpy_All(pP, Q_d->ptr, Q_d->pitch, D2H, nx_2lap, ny_2lap, nz_2lap);
+
+    MPI_File tmp_file;
+    MPI_File_open(MPI_COMM_WORLD, "Q.dat", MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &tmp_file);
+
+    write_3d1(tmp_file, 0, pP);
+
+    MPI_File_close(&tmp_file);
+
+    // Per-rank text dump of the interior points.
+    char filename[100];
+    FILE *fp;
+
+    snprintf(filename, sizeof(filename), "Q%02d%02d%02d.dat", npx, npy, npz);
+    fp = fopen(filename, "w");
+    if(fp == NULL){
+        // Writing through a NULL stream would crash; fail loudly instead.
+        printf("\033[31mCannot open %s for writing\033[0m\n", filename);
+        MPI_Abort(MPI_COMM_WORLD , 1);
+    }
+
+    fprintf(fp, "variables=x,y,z,Q\n");
+    fprintf(fp, "zone i=%d ,j=%d ,k=%d\n", nx, ny, nz);
+
+    for(int k = LAP; k < nz+LAP; k++){
+        for(int j = LAP; j < ny+LAP; j++){
+            for(int i = LAP; i < nx+LAP; i++){
+                const int idx = i + j*nx_2lap + k*nx_2lap*ny_2lap;
+                fprintf(fp, "%15.6f%15.6f%15.6f%15.6f\n", *(pAxx+idx), *(pAyy+idx), *(pAzz+idx), *(pP+idx));
+            }
+        }
+    }
+
+    fclose(fp);
+    
+    delete_cudaField(ui);
+    delete_cudaField(us);
+    delete_cudaField(uk);
+    delete_cudaField(vi);
+    delete_cudaField(vs);
+    delete_cudaField(vk);
+    delete_cudaField(wi);
+    delete_cudaField(ws);
+    delete_cudaField(wk);
+    delete_cudaField(Q_d);
+
+    // Shut MPI down cleanly before leaving: exiting a rank without
+    // MPI_Finalize can make the launcher abort ranks that are still writing.
+    MPI_Finalize();
+    exit(0);
+}
+
+/*
+ * ana_saveplaneXY: append XY-plane data of d, u, v, w, T to
+ * "Savedata-XY<iii>.dat", one file per configured plane group.
+ *
+ *   ID : row of ANA_npara to use. ANA_npara[ID][0] is the number of groups,
+ *        ANA_npara[ID][1] the number of consecutive planes per group and
+ *        ANA_npara[ID][2 + i] the first plane index of group i.
+ *
+ * Rank 0 opens each file in append mode and writes a small record
+ * (byte count, Istep, tt, byte count) before the planes; every rank then calls
+ * write_2d_XYa for each plane and field (presumably a collective gather to
+ * rank 0 - confirm against write_2d_XYa).
+ */
+void ana_saveplaneXY(int ID){
+    const int point = ANA_npara[ID][0];
+    const int bandwidth = ANA_npara[ID][1];
+
+    // Only rank 0 owns a file; keep the handle NULL everywhere else so that
+    // write_2d_XYa never receives an indeterminate pointer.
+    FILE *fp = NULL;
+    char fp_name[120];
+
+    memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+
+    for(int i = 0; i < point; i++){
+
+        if(my_id == 0){
+            printf("Save data ...., %d, %lf, %d\n", Istep, tt, i);
+            snprintf(fp_name, sizeof(fp_name), "Savedata-XY%03d.dat", i);
+            fp = fopen(fp_name, "a");
+            if(fp == NULL){
+                // The other ranks are about to enter write_2d_XYa; abort the
+                // whole job rather than crash on a NULL stream.
+                printf("\033[31mCannot open %s for appending\033[0m\n", fp_name);
+                MPI_Abort(MPI_COMM_WORLD , 1);
+            }
+
+            // Record framing: length marker before and after (Istep, tt).
+            int bytes = sizeof(REAL) + sizeof(int);
+
+            fwrite(&bytes, sizeof(int), 1, fp);
+            fwrite(&Istep, sizeof(int), 1, fp);
+            fwrite(&tt, sizeof(REAL), 1, fp);
+            fwrite(&bytes, sizeof(int), 1, fp);
+        }
+
+        const int first = ANA_npara[ID][i+2];
+        for(int j = first; j <= first+bandwidth-1; j++){
+            write_2d_XYa(fp, j, pd);
+            write_2d_XYa(fp, j, pu);
+            write_2d_XYa(fp, j, pv);
+            write_2d_XYa(fp, j, pw);
+            write_2d_XYa(fp, j, pT);
+        }
+
+        if(my_id == 0){
+            fclose(fp);
+            fp = NULL;
+        }
+    }
+}
+
+/*
+ * ana_saveplaneYZ: append YZ-plane data of d, u, v, w, T to
+ * "Savedata-YZ<iii>.dat", one file per configured plane group.
+ *
+ *   ID : row of ANA_npara to use. ANA_npara[ID][0] is the number of groups,
+ *        ANA_npara[ID][1] the number of consecutive planes per group and
+ *        ANA_npara[ID][2 + i] the first plane index of group i.
+ *
+ * Rank 0 opens each file in append mode and writes a small record
+ * (byte count, Istep, tt, byte count) before the planes; every rank then calls
+ * write_2d_YZa for each plane and field (presumably a collective gather to
+ * rank 0 - confirm against write_2d_YZa).
+ */
+void ana_saveplaneYZ(int ID){
+    const int point = ANA_npara[ID][0];
+    const int bandwidth = ANA_npara[ID][1];
+
+    // Only rank 0 owns a file; keep the handle NULL everywhere else so that
+    // write_2d_YZa never receives an indeterminate pointer.
+    FILE *fp = NULL;
+    char fp_name[120];
+
+    memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+
+    for(int i = 0; i < point; i++){
+
+        if(my_id == 0){
+            printf("Save data ...., %d, %lf, %d\n", Istep, tt, i);
+            snprintf(fp_name, sizeof(fp_name), "Savedata-YZ%03d.dat", i);
+            fp = fopen(fp_name, "a");
+            if(fp == NULL){
+                // The other ranks are about to enter write_2d_YZa; abort the
+                // whole job rather than crash on a NULL stream.
+                printf("\033[31mCannot open %s for appending\033[0m\n", fp_name);
+                MPI_Abort(MPI_COMM_WORLD , 1);
+            }
+
+            // Record framing: length marker before and after (Istep, tt).
+            int bytes = sizeof(REAL) + sizeof(int);
+
+            fwrite(&bytes, sizeof(int), 1, fp);
+            fwrite(&Istep, sizeof(int), 1, fp);
+            fwrite(&tt, sizeof(REAL), 1, fp);
+            fwrite(&bytes, sizeof(int), 1, fp);
+        }
+
+        const int first = ANA_npara[ID][i+2];
+        for(int j = first; j <= first+bandwidth-1; j++){
+            write_2d_YZa(fp, j, pd);
+            write_2d_YZa(fp, j, pu);
+            write_2d_YZa(fp, j, pv);
+            write_2d_YZa(fp, j, pw);
+            write_2d_YZa(fp, j, pT);
+        }
+
+        if(my_id == 0){
+            fclose(fp);
+            fp = NULL;
+        }
+    }
+}
+
+/*
+ * ana_saveplaneXZ: append XZ-plane data of d, u, v, w, T to
+ * "Savedata-XZ<iii>.dat", one file per configured plane group.
+ *
+ *   ID : row of ANA_npara to use. ANA_npara[ID][0] is the number of groups,
+ *        ANA_npara[ID][1] the number of consecutive planes per group and
+ *        ANA_npara[ID][2 + i] the first plane index of group i.
+ *
+ * Rank 0 opens each file in append mode and writes a small record
+ * (byte count, Istep, tt, byte count) before the planes; every rank then calls
+ * write_2d_XZa for each plane and field (presumably a collective gather to
+ * rank 0 - confirm against write_2d_XZa).
+ */
+void ana_saveplaneXZ(int ID){
+    const int point = ANA_npara[ID][0];
+    const int bandwidth = ANA_npara[ID][1];
+
+    // Only rank 0 owns a file; keep the handle NULL everywhere else so that
+    // write_2d_XZa never receives an indeterminate pointer.
+    FILE *fp = NULL;
+    char fp_name[120];
+
+    memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
+
+    for(int i = 0; i < point; i++){
+
+        if(my_id == 0){
+            printf("Save data ...., %d, %lf, %d\n", Istep, tt, i);
+            snprintf(fp_name, sizeof(fp_name), "Savedata-XZ%03d.dat", i);
+            fp = fopen(fp_name, "a");
+            if(fp == NULL){
+                // The other ranks are about to enter write_2d_XZa; abort the
+                // whole job rather than crash on a NULL stream.
+                printf("\033[31mCannot open %s for appending\033[0m\n", fp_name);
+                MPI_Abort(MPI_COMM_WORLD , 1);
+            }
+
+            // Record framing: length marker before and after (Istep, tt).
+            int bytes = sizeof(REAL) + sizeof(int);
+
+            fwrite(&bytes, sizeof(int), 1, fp);
+            fwrite(&Istep, sizeof(int), 1, fp);
+            fwrite(&tt, sizeof(REAL), 1, fp);
+            fwrite(&bytes, sizeof(int), 1, fp);
+        }
+
+        const int first = ANA_npara[ID][i+2];
+        for(int j = first; j <= first+bandwidth-1; j++){
+            write_2d_XZa(fp, j, pd);
+            write_2d_XZa(fp, j, pu);
+            write_2d_XZa(fp, j, pv);
+            write_2d_XZa(fp, j, pw);
+            write_2d_XZa(fp, j, pT);
+        }
+
+        if(my_id == 0){
+            fclose(fp);
+            fp = NULL;
+        }
+    }
+}
+
+/*
+ * OCFD_ana: run the analysis action selected by `style`.
+ *
+ *   100  ana_NAN_and_NT()
+ *   101  ana_time_average()
+ *   102  HybridAuto_scheme_IO()
+ *   103  get_Q()
+ *   104  ana_saveplaneXY(ID)
+ *   105  ana_saveplaneYZ(ID)
+ *   106  ana_saveplaneXZ(ID)
+ *   107  HybridAuto_scheme_Proportion(), only when IFLAG_HybridAuto == 1
+ *
+ * `ID` is forwarded to the plane-saving actions only. Any other `style`
+ * value is silently ignored.
+ */
+void OCFD_ana(int style, int ID){
+    switch(style){
+        case 100: ana_NAN_and_NT();        break;
+        case 101: ana_time_average();      break;
+        case 102: HybridAuto_scheme_IO();  break;
+        case 103: get_Q();                 break;
+        case 104: ana_saveplaneXY(ID);     break;
+        case 105: ana_saveplaneYZ(ID);     break;
+        case 106: ana_saveplaneXZ(ID);     break;
+
+        case 107:
+            if(IFLAG_HybridAuto == 1) HybridAuto_scheme_Proportion();
+            break;
+
+        default:
+            // unknown action: nothing to do
+            break;
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_bound_Scheme.cu
+++ b/src/OCFD_bound_Scheme.cu
+//      boundary scheme
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_Schemes.h"
+#include "parameters_d.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "mpi.h"
+
+
+
+// PREPARE_x / PREPARE_y / PREPARE_z
+// Declare local `blockdim` and `griddim` and fill them with cal_grid_block_dim,
+// passing an extent of 1 along the named axis and size.<axis> for the other
+// two. The expansion site must already have `blockdim_in` and `size` in scope.
+// Because the macros declare variables, each must be expanded inside its own
+// brace scope.
+#define PREPARE_x \
+dim3 blockdim , griddim;\
+cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , 1 , size.y , size.z);\
+
+
+#define PREPARE_y \
+dim3 blockdim , griddim;\
+cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , size.x , 1 , size.z);\
+
+
+#define PREPARE_z \
+dim3 blockdim , griddim;\
+cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , size.x , size.y , 1);\
+
+
+// CHECK_SIZE(dir, call)
+// Expand PREPARE_<dir> and then run `call`.
+// With DEBUG_MODE defined, this only happens when size.<dir> >= LAP; otherwise
+// the job start/size along <dir> is printed and the run is stopped with
+// MPI_Abort. Without DEBUG_MODE no size check is made.
+// The expansion site must have `size` and `job_in` in scope.
+#ifdef DEBUG_MODE
+#define CHECK_SIZE(dir , call)\
+if(size.dir >= LAP){\
+	PREPARE_ ##dir\
+	call;\
+}else{\
+	printf("job_in.start." #dir " = %d , job_in.size." #dir " = %d\n",job_in.start.dir , size.dir);\
+	printf("illegal size , to launch %s , size." #dir " >= LAP (%d) is required\n" , __FUNCTION__ ,LAP);\
+	MPI_Abort(MPI_COMM_WORLD , 1);\
+}
+#else
+#define CHECK_SIZE(dir , call) \
+PREPARE_ ##dir\
+call;
+#endif
+
+
+// CHECK_X / CHECK_Y / CHECK_Z (callm, callp)
+// Declare `size`, fill it with jobsize(&job_in, &size), then:
+//   - run `callm` (through CHECK_SIZE) when this rank is the first one along
+//     the axis (np? == 0) and the job starts at index LAP;
+//   - run `callp` (through CHECK_SIZE) when this rank is the last one along
+//     the axis (np? == NP?0-1) and the job ends at n?_lap.
+// Both branches may run in the same expansion. Because `size` is declared
+// here, each macro can be used at most once per scope.
+#define CHECK_X(callm , callp)\
+dim3 size;\
+jobsize(&job_in , &size);\
+if(npx == 0 && job_in.start.x == LAP){\
+	CHECK_SIZE(x , callm)\
+}\
+if(npx == NPX0-1 && (job_in.start.x + size.x == nx_lap) ){\
+	CHECK_SIZE(x , callp)\
+}
+
+#define CHECK_Y(callm , callp)\
+dim3 size;\
+jobsize(&job_in , &size);\
+if(npy == 0 && job_in.start.y == LAP){\
+	CHECK_SIZE(y , callm)\
+}\
+if(npy == NPY0-1 && (job_in.start.y + size.y == ny_lap) ){\
+	CHECK_SIZE(y , callp)\
+}
+
+#define CHECK_Z(callm , callp)\
+dim3 size;\
+jobsize(&job_in , &size);\
+if(npz == 0 && job_in.start.z == LAP){\
+	CHECK_SIZE(z , callm)\
+}\
+if(npz == NPZ0-1 && (job_in.start.z + size.z == nz_lap) ){\
+	CHECK_SIZE(z , callp)\
+}
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+
+// =========================================================================================================== //
+// Central difference about c[0] using `hw` points on each side (hw = 1, 2, 3).
+// The grid spacing is NOT applied here.
+static __device__ REAL D0bound_central_diff(const REAL *c, unsigned int hw){
+	if(hw == 1) return (c[1] - c[-1])*0.5;
+	if(hw == 2) return (c[-2] - 8.0*c[-1] + 8.0*c[1] - c[2])/12.0;
+	return (c[3] - c[-3]
+	  -9.0*(c[2] - c[-2])
+	 +45.0*(c[1] - c[-1]))/60.0;
+}
+
+// Lower-side closure. `idx` is the point's index counted from the lower edge:
+// 0 -> two-point forward difference, 1..3 -> central difference of that
+// half-width. Returns 0 when *out was written, 1 when idx > 3.
+static __device__ int D0bound_lower_side(REAL *out, const REAL *c, unsigned int idx){
+	if(idx == 0){
+		*out = (c[1] - c[0]);
+		return 0;
+	}
+	if(idx <= 3){
+		*out = D0bound_central_diff(c, idx);
+		return 0;
+	}
+	return 1;
+}
+
+// Upper-side closure. `dist` is the point's distance from the last index:
+// 0 -> two-point backward difference, 1..3 -> central difference of that
+// half-width. Returns 0 when *out was written, 1 when dist > 3.
+static __device__ int D0bound_upper_side(REAL *out, const REAL *c, unsigned int dist){
+	if(dist == 0){
+		*out = (c[0] - c[-1]);
+		return 0;
+	}
+	if(dist <= 3){
+		*out = D0bound_central_diff(c, dist);
+		return 0;
+	}
+	return 1;
+}
+
+// Boundary closure for the first-derivative stencil.
+//   tmp       : receives the undivided difference when a closure applies
+//   flagxyzb  : flagxyzb.y selects the boundary: 1/2/3 test coords.x/y/z
+//               against 0..3, 4/5/6 test coords.x/y/z against the last four
+//               indices of the job extent (job.end - job.start)
+//   coords    : point index inside the job
+//   stencil   : stencil values; the centre point is stencil[-ka1]
+//   ka1       : offset locating the centre inside `stencil`
+//   job       : job box, used only for the upper-side extent
+// Returns 0 if *tmp was written by a boundary closure, 1 otherwise (any other
+// flagxyzb.y, or a point more than three cells away from the boundary).
+__device__ int OCFD_D0bound_scheme_kernel(REAL* tmp, dim3 flagxyzb, dim3 coords, REAL *stencil, int ka1, cudaJobPackage job){
+
+	switch(flagxyzb.y){
+		case 1: return D0bound_lower_side(tmp, stencil - ka1, coords.x);
+		case 2: return D0bound_lower_side(tmp, stencil - ka1, coords.y);
+		case 3: return D0bound_lower_side(tmp, stencil - ka1, coords.z);
+
+		case 4: return D0bound_upper_side(tmp, stencil - ka1, (job.end.x-job.start.x-1) - coords.x);
+		case 5: return D0bound_upper_side(tmp, stencil - ka1, (job.end.y-job.start.y-1) - coords.y);
+		case 6: return D0bound_upper_side(tmp, stencil - ka1, (job.end.z-job.start.z-1) - coords.z);
+	}
+
+	return 1;
+
+}
+
+
+// Lower-x boundary closure of the x-derivative.
+// For every (y, z) in [job.start, job.end) the first four x-points of fx
+// (indices 0..3, addressed without halo) are filled from f (addressed with
+// halo, samples at x = LAP .. LAP+6): a two-point forward difference, then
+// 3-, 5- and 7-point central differences, each divided by hx_d.
+__global__ void OCFD_Dx0_bound_kernel_m(cudaField f , cudaField fx , cudaJobPackage job){
+	const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+	const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+	if(y >= job.end.y || z >= job.end.z) return;
+
+	// f0 .. f6: the seven samples nearest the lower boundary
+	const REAL f0 = get_Field_LAP(f , LAP   , y , z);
+	const REAL f1 = get_Field_LAP(f , LAP+1 , y , z);
+	const REAL f2 = get_Field_LAP(f , LAP+2 , y , z);
+	const REAL f3 = get_Field_LAP(f , LAP+3 , y , z);
+	const REAL f4 = get_Field_LAP(f , LAP+4 , y , z);
+	const REAL f5 = get_Field_LAP(f , LAP+5 , y , z);
+	const REAL f6 = get_Field_LAP(f , LAP+6 , y , z);
+
+	get_Field(fx , 0 , y-LAP , z-LAP) = ( f1 - f0)/hx_d;
+	get_Field(fx , 1 , y-LAP , z-LAP) = ( f2 - f0)*0.5/hx_d;
+	get_Field(fx , 2 , y-LAP , z-LAP) = ( f0 - 8.0*f1 + 8.0*f3 - f4)/(12.0*hx_d);
+	get_Field(fx , 3 , y-LAP , z-LAP) = ( f6 - f0 -9.0*(f5 - f1) +45.0*(f4 - f2) )/(60.0*hx_d);
+}
+
+
+// Upper-x boundary closure of the x-derivative.
+// For every (y, z) in [job.start, job.end) the last four x-points of fx
+// (indices nx_d-1 .. nx_d-4, addressed without halo) are filled from f
+// (addressed with halo, samples at x = nx_d+LAP-1 down to nx_d+LAP-7): a
+// two-point backward difference, then 3-, 5- and 7-point central differences,
+// each divided by hx_d.
+__global__ void OCFD_Dx0_bound_kernel_p(cudaField f , cudaField fx , cudaJobPackage job){
+	const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+	const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+	if(y >= job.end.y || z >= job.end.z) return;
+
+	// index of the last x-point in halo-inclusive addressing
+	const unsigned int last = nx_d+LAP-1;
+
+	// b0 .. b6: samples counted backwards from the upper boundary
+	const REAL b0 = get_Field_LAP(f , last   , y , z);
+	const REAL b1 = get_Field_LAP(f , last-1 , y , z);
+	const REAL b2 = get_Field_LAP(f , last-2 , y , z);
+	const REAL b3 = get_Field_LAP(f , last-3 , y , z);
+	const REAL b4 = get_Field_LAP(f , last-4 , y , z);
+	const REAL b5 = get_Field_LAP(f , last-5 , y , z);
+	const REAL b6 = get_Field_LAP(f , last-6 , y , z);
+
+	get_Field(fx , nx_d - 1 , y-LAP , z-LAP) = ( b0 - b1)/hx_d;
+	get_Field(fx , nx_d - 2 , y-LAP , z-LAP) = ( b0 - b2)*0.5/hx_d;
+	get_Field(fx , nx_d - 3 , y-LAP , z-LAP) = ( b4 - 8.0*b3 + 8.0*b1 - b0)/(12.0*hx_d);
+	get_Field(fx , nx_d - 4 , y-LAP , z-LAP) = ( b0 - b6 -9.0*( b1 - b5 ) +45.0*( b2 - b4) )/(60.0*hx_d);
+}
+
+// =========================================================================================================== //
+
+// Lower-y boundary closure of the y-derivative.
+// For every (x, z) in [job.start, job.end) the first four y-points of fx
+// (indices 0..3, addressed without halo) are filled from f (addressed with
+// halo, samples at y = LAP .. LAP+6): a two-point forward difference, then
+// 3-, 5- and 7-point central differences, each divided by hy_d.
+__global__ void OCFD_Dy0_bound_kernel_m(cudaField f , cudaField fx , cudaJobPackage job){
+	const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+	const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+	if(z >= job.end.z || x >= job.end.x) return;
+
+	// f0 .. f6: the seven samples nearest the lower boundary
+	const REAL f0 = get_Field_LAP(f , x, LAP  , z);
+	const REAL f1 = get_Field_LAP(f , x, LAP+1, z);
+	const REAL f2 = get_Field_LAP(f , x, LAP+2, z);
+	const REAL f3 = get_Field_LAP(f , x, LAP+3, z);
+	const REAL f4 = get_Field_LAP(f , x, LAP+4, z);
+	const REAL f5 = get_Field_LAP(f , x, LAP+5, z);
+	const REAL f6 = get_Field_LAP(f , x, LAP+6, z);
+
+	get_Field(fx , x-LAP, 0, z-LAP) = ( f1 - f0)/hy_d;
+	get_Field(fx , x-LAP, 1, z-LAP) = ( f2 - f0)*0.5/hy_d;
+	get_Field(fx , x-LAP, 2, z-LAP) = ( f0 - 8.0*f1 + 8.0*f3 - f4)/(12.0*hy_d);
+	get_Field(fx , x-LAP, 3, z-LAP) = ( f6 - f0 -9.0*(f5 - f1) +45.0*(f4 - f2) )/(60.0*hy_d);
+}
+
+// Upper-y boundary closure of the y-derivative.
+// For every (x, z) in [job.start, job.end) the last four y-points of fx
+// (indices ny_d-1 .. ny_d-4, addressed without halo) are filled from f
+// (addressed with halo, samples at y = ny_d+LAP-1 down to ny_d+LAP-7).
+// The differences are written as the negated mirror image of the lower-side
+// formulas, each divided by hy_d.
+__global__ void OCFD_Dy0_bound_kernel_p(cudaField f , cudaField fx , cudaJobPackage job){
+	const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+	const unsigned int z = job.start.z + blockDim.z * blockIdx.z + threadIdx.z;
+
+	if(z >= job.end.z || x >= job.end.x) return;
+
+	// index of the last y-point in halo-inclusive addressing
+	const unsigned int last = ny_d+LAP-1;
+
+	// b0 .. b6: samples counted backwards from the upper boundary
+	const REAL b0 = get_Field_LAP(f , x, last  , z);
+	const REAL b1 = get_Field_LAP(f , x, last-1, z);
+	const REAL b2 = get_Field_LAP(f , x, last-2, z);
+	const REAL b3 = get_Field_LAP(f , x, last-3, z);
+	const REAL b4 = get_Field_LAP(f , x, last-4, z);
+	const REAL b5 = get_Field_LAP(f , x, last-5, z);
+	const REAL b6 = get_Field_LAP(f , x, last-6, z);
+
+	get_Field(fx, x-LAP, ny_d-1, z-LAP) = -( b1 - b0)/hy_d;
+	get_Field(fx, x-LAP, ny_d-2, z-LAP) = -( b2 - b0)*0.5/hy_d;
+	get_Field(fx, x-LAP, ny_d-3, z-LAP) = -( b0 - 8.0*b1 + 8.0*b3 - b4)/(12.0*hy_d);
+	get_Field(fx, x-LAP, ny_d-4, z-LAP) = -( b6 - b0 -9.0*(b5 - b1) +45.0*(b4 - b2) )/(60.0*hy_d);
+}
+
+// =========================================================================================================== //
+
+
+// Lower-z boundary closure of the z-derivative.
+// For every (x, y) in [job.start, job.end) the first four z-points of fx
+// (indices 0..3, addressed without halo) are filled from f (addressed with
+// halo, samples at z = LAP .. LAP+6): a two-point forward difference, then
+// 3-, 5- and 7-point central differences, each divided by hz_d.
+__global__ void OCFD_Dz0_bound_kernel_m(cudaField f , cudaField fx , cudaJobPackage job){
+	const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+	const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+
+	if(y >= job.end.y || x >= job.end.x) return;
+
+	// f0 .. f6: the seven samples nearest the lower boundary
+	const REAL f0 = get_Field_LAP(f , x,y, LAP  );
+	const REAL f1 = get_Field_LAP(f , x,y, LAP+1);
+	const REAL f2 = get_Field_LAP(f , x,y, LAP+2);
+	const REAL f3 = get_Field_LAP(f , x,y, LAP+3);
+	const REAL f4 = get_Field_LAP(f , x,y, LAP+4);
+	const REAL f5 = get_Field_LAP(f , x,y, LAP+5);
+	const REAL f6 = get_Field_LAP(f , x,y, LAP+6);
+
+	get_Field(fx , x-LAP,y-LAP, 0) = ( f1 - f0)/hz_d;
+	get_Field(fx , x-LAP,y-LAP, 1) = ( f2 - f0)*0.5/hz_d;
+	get_Field(fx , x-LAP,y-LAP, 2) = ( f0 - 8.0*f1 + 8.0*f3 - f4)/(12.0*hz_d);
+	get_Field(fx , x-LAP,y-LAP, 3) = ( f6 - f0 -9.0*(f5 - f1) +45.0*(f4 - f2) )/(60.0*hz_d);
+}
+
+// Upper-z boundary closure of the z-derivative.
+// For every (x, y) in [job.start, job.end) the last four z-points of fx
+// (indices nz_d-1 .. nz_d-4, addressed without halo) are filled from f
+// (addressed with halo, samples at z = nz_d+LAP-1 down to nz_d+LAP-7): a
+// two-point backward difference, then 3-, 5- and 7-point central differences,
+// each divided by hz_d.
+__global__ void OCFD_Dz0_bound_kernel_p(cudaField f , cudaField fx , cudaJobPackage job){
+	const unsigned int x = job.start.x + blockDim.x * blockIdx.x + threadIdx.x;
+	const unsigned int y = job.start.y + blockDim.y * blockIdx.y + threadIdx.y;
+
+	if(y >= job.end.y || x >= job.end.x) return;
+
+	// index of the last z-point in halo-inclusive addressing
+	const unsigned int last = nz_d+LAP-1;
+
+	// b0 .. b6: samples counted backwards from the upper boundary
+	const REAL b0 = get_Field_LAP(f , x,y, last  );
+	const REAL b1 = get_Field_LAP(f , x,y, last-1);
+	const REAL b2 = get_Field_LAP(f , x,y, last-2);
+	const REAL b3 = get_Field_LAP(f , x,y, last-3);
+	const REAL b4 = get_Field_LAP(f , x,y, last-4);
+	const REAL b5 = get_Field_LAP(f , x,y, last-5);
+	const REAL b6 = get_Field_LAP(f , x,y, last-6);
+
+	get_Field(fx ,  x-LAP,y-LAP, nz_d - 1) = ( b0 - b1)/hz_d;
+	get_Field(fx ,  x-LAP,y-LAP, nz_d - 2) = ( b0 - b2)*0.5/hz_d;
+	get_Field(fx ,  x-LAP,y-LAP, nz_d - 3) = ( b4 - 8.0*b3 + 8.0*b1 - b0)/(12.0*hz_d);
+	get_Field(fx ,  x-LAP,y-LAP, nz_d - 4) = ( b0 - b6 -9.0*(b1 - b5) +45.0*(b2 - b4) )/(60.0*hz_d);
+}
+
+
+
+
+// Host wrapper for the x-direction boundary closures.
+// CHECK_X decides, from npx and job_in, whether this rank touches the lower
+// and/or upper x boundary and launches the matching kernel on *stream.
+// `griddim` / `blockdim` used in the launches are declared by the macro.
+void OCFD_Dx0_bound(cudaField f , cudaField fx , cudaJobPackage job_in , dim3 blockdim_in, cudaStream_t *stream){ 
+	CHECK_X(
+		/* lower boundary */
+		{ CUDA_LAUNCH(( OCFD_Dx0_bound_kernel_m<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) )); },
+		/* upper boundary */
+		{ CUDA_LAUNCH(( OCFD_Dx0_bound_kernel_p<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) )); }
+	)
+}
+
+
+// Host wrapper for the y-direction boundary closures.
+// CHECK_Y decides, from npy and job_in, whether this rank touches the lower
+// and/or upper y boundary and launches the matching kernel on *stream.
+// `griddim` / `blockdim` used in the launches are declared by the macro.
+void OCFD_Dy0_bound(cudaField f , cudaField fx , cudaJobPackage job_in , dim3 blockdim_in, cudaStream_t *stream){
+	CHECK_Y(
+		/* lower boundary */
+		{ CUDA_LAUNCH(( OCFD_Dy0_bound_kernel_m<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) )); },
+		/* upper boundary */
+		{ CUDA_LAUNCH(( OCFD_Dy0_bound_kernel_p<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) )); }
+	)
+}
+
+
+// Host wrapper for the z-direction boundary closures.
+// CHECK_Z decides, from npz and job_in, whether this rank touches the lower
+// and/or upper z boundary and launches the matching kernel on *stream.
+// `griddim` / `blockdim` used in the launches are declared by the macro.
+void OCFD_Dz0_bound(cudaField f , cudaField fx , cudaJobPackage job_in , dim3 blockdim_in, cudaStream_t *stream){
+	CHECK_Z(
+		/* lower boundary */
+		{ CUDA_LAUNCH(( OCFD_Dz0_bound_kernel_m<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) )); },
+		/* upper boundary */
+		{ CUDA_LAUNCH(( OCFD_Dz0_bound_kernel_p<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) )); }
+	)
+}
+
+
+
+// Set flagxyzb->y to the boundary code this job touches, if any.
+//   flagxyzb->x : direction selector (1 or 4 -> x, 2 or 5 -> y, 3 or 6 -> z)
+//   boundp      : enables the lower-boundary code (1, 2, 3) when equal to 1
+//   boundm      : enables the upper-boundary code (4, 5, 6) when equal to 1
+//   job         : index box in halo-inclusive coordinates
+// The lower code is set when this rank is first along the axis and the job
+// starts at LAP; the upper code when this rank is last along the axis and the
+// job ends at n?_lap. flagxyzb->y is left untouched when neither applies.
+// NOTE(review): when both conditions hold the upper code overwrites the lower
+// one - confirm callers never need both at once.
+void OCFD_bound(dim3 *flagxyzb, int boundp, int boundm, cudaJobPackage job){
+	dim3 size;
+	jobsize(&job, &size);
+
+	const unsigned int dir = flagxyzb->x;
+
+	if(dir == 1 || dir == 4){
+		const int at_lower = (npx == 0 && job.start.x == LAP && boundp == 1);
+		const int at_upper = (npx == NPX0-1 && job.end.x == nx_lap && boundm == 1);
+		if(at_lower) flagxyzb->y = 1;
+		if(at_upper) flagxyzb->y = 4;
+	}else if(dir == 2 || dir == 5){
+		const int at_lower = (npy == 0 && job.start.y == LAP && boundp == 1);
+		const int at_upper = (npy == NPY0-1 && job.end.y == ny_lap && boundm == 1);
+		if(at_lower) flagxyzb->y = 2;
+		if(at_upper) flagxyzb->y = 5;
+	}else if(dir == 3 || dir == 6){
+		const int at_lower = (npz == 0 && job.start.z == LAP && boundp == 1);
+		const int at_upper = (npz == NPZ0-1 && job.end.z == nz_lap && boundm == 1);
+		if(at_lower) flagxyzb->y = 3;
+		if(at_upper) flagxyzb->y = 6;
+	}
+}
+
+/*__device__ int OCFD_bound_scheme_kernel_p(int flag, dim3 flagxyzb, dim3 coords, cudaSoA du, int num, cudaField fx, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
+	unsigned int offset_out = job.start.x + fx.pitch*(job.start.y + ny_d*job.start.z);
+	if(flag != 0){
+		switch(flagxyzb.x){
+			case 4:
+			if(threadIdx.x != 0) get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp_r - tmp_l)/hx_d;
+			break;
+
+			case 5:
+			if(threadIdx.x != 0) get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp_r - tmp_l)/hy_d;
+			break;
+
+			case 6:
+			if(threadIdx.x != 0) get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp_r - tmp_l)/hz_d;
+			break;
+		}
+
+		switch(flagxyzb.y){
+			case 4:
+			{
+				if(coords.x == job.end.x-job.start.x-1){
+					REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+					REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
+
+					get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hx_d;
+
+					return 0;
+
+				}else if(coords.x >= job.end.x-job.start.x-kb1){
+					REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
+
+					get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hx_d;
+
+					return 0;
+				}
+			}
+			break;
+
+			case 5:
+			{
+				if(coords.y == job.end.y-job.start.y-1){
+					REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+					REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
+
+					get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hy_d;
+
+					return 0;
+				
+				}else if(coords.y >= job.end.y-job.start.y-kb1){
+					REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
+
+					get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hy_d;
+
+					return 0;
+				}
+			}
+			break;
+
+			case 6:
+			{
+				if(coords.z == job.end.z-job.start.z-1){
+					REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+					REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
+
+					get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hz_d;
+
+					return 0;
+				
+				}else if(coords.z >= job.end.z-job.start.z-kb1){
+					REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
+
+					get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hz_d;
+
+					return 0;
+				}
+			}
+			break;
+		}
+	}
+
+	return flag;
+}*/
+
+/*
+ * Single-candidate reconstruction built from stencil[0], stencil[1] and
+ * stencil[2] only.
+ * A weight is formed from the sum of two weighted squared differences, but
+ * because there is only one candidate the same weight multiplies both the
+ * numerator and the denominator of the returned ratio.
+ */
+__device__ REAL OCFD_weno5_kernel_P_right(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta = 0.0;
+
+	// accumulate the two squared-difference terms
+	diff = stencil[0] - 2.0*stencil[1] +     stencil[2];
+	beta += 13*diff*diff;
+	diff = stencil[0] - 4.0*stencil[1] + 3.0*stencil[2];
+	beta +=  3*diff*diff;
+
+	const REAL weight = 1.0/((12.0*eps + beta)*(12.0*eps + beta));
+	const REAL cand = (2.0*stencil[0] - 7.0*stencil[1] + 11.0*stencil[2]);
+
+	// weighted candidate divided by 6 times the (single) weight
+	const REAL result = weight*cand/(6.0*weight);
+	return result;
+}
+
+
+/*
+ * Single-candidate reconstruction built from stencil[2], stencil[3] and
+ * stencil[4] only.
+ * The weight derived from the squared differences multiplies both the
+ * numerator and the denominator of the returned ratio.
+ */
+__device__ REAL OCFD_weno5_kernel_P_lift(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta = 0.0;
+
+	// accumulate the two squared-difference terms
+	diff =     stencil[2] - 2.0*stencil[3] + stencil[4];
+	beta += 13*diff*diff;
+	diff = 3.0*stencil[2] - 4.0*stencil[3] + stencil[4];
+	beta +=  3*diff*diff;
+
+	const REAL weight = 1.0/((12.0*eps + beta)*(12.0*eps + beta));
+	const REAL cand = (2.0*stencil[2] + 5.0*stencil[3] - stencil[4]);
+
+	const REAL result = weight*cand/(6.0*weight);
+	return result;
+}
+
+
+/*
+ * Single-candidate reconstruction built from stencil[2], stencil[1] and
+ * stencil[0] only (same formula as OCFD_weno5_kernel_P_lift with the
+ * index order reversed).
+ */
+__device__ REAL OCFD_weno5_kernel_M_right(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta = 0.0;
+
+	// accumulate the two squared-difference terms
+	diff =     stencil[2] - 2.0*stencil[1] + stencil[0];
+	beta += 13*diff*diff;
+	diff = 3.0*stencil[2] - 4.0*stencil[1] + stencil[0];
+	beta +=  3*diff*diff;
+
+	const REAL weight = 1.0/((12.0*eps + beta)*(12.0*eps + beta));
+	const REAL cand = (2.0*stencil[2] + 5.0*stencil[1] - stencil[0]);
+
+	const REAL result = weight*cand/(6.0*weight);
+	return result;
+}
+
+
+/*
+ * Single-candidate reconstruction built from stencil[4], stencil[3] and
+ * stencil[2] only (same formula as OCFD_weno5_kernel_P_right with the
+ * index order reversed).
+ */
+__device__ REAL OCFD_weno5_kernel_M_lift(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta = 0.0;
+
+	// accumulate the two squared-difference terms
+	diff = stencil[4] - 2.0*stencil[3] +     stencil[2];
+	beta += 13*diff*diff;
+	diff = stencil[4] - 4.0*stencil[3] + 3.0*stencil[2];
+	beta +=  3*diff*diff;
+
+	const REAL weight = 1.0/((12.0*eps + beta)*(12.0*eps + beta));
+	const REAL cand = (2.0*stencil[4] - 7.0*stencil[3] + 11.0*stencil[2]);
+
+	const REAL result = weight*cand/(6.0*weight);
+	return result;
+}
+
+/*
+ * Two-candidate weighted reconstruction using stencil[0..3].
+ * Candidate 1 reads stencil[1..3] (weight numerator 6.0), candidate 2 reads
+ * stencil[0..2] (weight numerator 1.0). The result is the weighted sum of
+ * the candidates divided by 6 times the sum of the weights.
+ */
+__device__ REAL OCFD_weno5_kernel_P_right_plus(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta1 = 0.0, beta2 = 0.0;
+
+	// candidate on stencil[1..3]
+	diff = stencil[1] - 2.0*stencil[2] + stencil[3];
+	beta1 += 13*diff*diff;
+	diff =                  stencil[1] - stencil[3];
+	beta1 +=  3*diff*diff;
+
+	const REAL w1 = 6.0/((12.0*eps + beta1)*(12.0*eps + beta1));
+	const REAL cand1 = (-stencil[1] + 5.0*stencil[2] + 2.0*stencil[3]);
+
+	// candidate on stencil[0..2]
+	diff = stencil[0] - 2.0*stencil[1] +     stencil[2];
+	beta2 += 13*diff*diff;
+	diff = stencil[0] - 4.0*stencil[1] + 3.0*stencil[2];
+	beta2 +=  3*diff*diff;
+
+	const REAL w2 = 1.0/((12.0*eps + beta2)*(12.0*eps + beta2));
+	const REAL cand2 = (2.0*stencil[0] - 7.0*stencil[1] + 11.0*stencil[2]);
+
+	const REAL result = (w1*cand1 + w2*cand2)/(6.0*(w1 + w2));
+	return result;
+}
+
+
+/*
+ * Two-candidate weighted reconstruction using stencil[1..4].
+ * Candidate 0 reads stencil[2..4] (weight numerator 3.0), candidate 1 reads
+ * stencil[1..3] (weight numerator 6.0). The result is the weighted sum of
+ * the candidates divided by 6 times the sum of the weights.
+ */
+__device__ REAL OCFD_weno5_kernel_P_lift_plus(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta0 = 0.0, beta1 = 0.0;
+
+	// candidate on stencil[2..4]
+	diff =     stencil[2] - 2.0*stencil[3] + stencil[4];
+	beta0 += 13*diff*diff;
+	diff = 3.0*stencil[2] - 4.0*stencil[3] + stencil[4];
+	beta0 +=  3*diff*diff;
+
+	const REAL w0 = 3.0/((12.0*eps + beta0)*(12.0*eps + beta0));
+	const REAL cand0 = (2.0*stencil[2] + 5.0*stencil[3] - stencil[4]);
+
+	// candidate on stencil[1..3]
+	diff = stencil[1] - 2.0*stencil[2] + stencil[3];
+	beta1 += 13*diff*diff;
+	diff =                  stencil[1] - stencil[3];
+	beta1 +=  3*diff*diff;
+
+	const REAL w1 = 6.0/((12.0*eps + beta1)*(12.0*eps + beta1));
+	const REAL cand1 = (-stencil[1] + 5.0*stencil[2] + 2.0*stencil[3]);
+
+	const REAL result = (w0*cand0 + w1*cand1)/(6.0*(w0 + w1));
+	return result;
+}
+
+
+/*
+ * Two-candidate weighted reconstruction using stencil[0..3], with the index
+ * order reversed relative to OCFD_weno5_kernel_P_lift_plus.
+ * Candidate 0 reads stencil[2],[1],[0] (weight numerator 3.0), candidate 1
+ * reads stencil[3],[2],[1] (weight numerator 6.0).
+ */
+__device__ REAL OCFD_weno5_kernel_M_right_plus(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta0 = 0.0, beta1 = 0.0;
+
+	// candidate on stencil[2],[1],[0]
+	diff =     stencil[2] - 2.0*stencil[1] + stencil[0];
+	beta0 += 13*diff*diff;
+	diff = 3.0*stencil[2] - 4.0*stencil[1] + stencil[0];
+	beta0 +=  3*diff*diff;
+
+	const REAL w0 = 3.0/((12.0*eps + beta0)*(12.0*eps + beta0));
+	const REAL cand0 = (2.0*stencil[2] + 5.0*stencil[1] - stencil[0]);
+
+	// candidate on stencil[3],[2],[1]
+	diff = stencil[3] - 2.0*stencil[2] + stencil[1];
+	beta1 += 13*diff*diff;
+	diff =                  stencil[3] - stencil[1];
+	beta1 +=  3*diff*diff;
+
+	const REAL w1 = 6.0/((12.0*eps + beta1)*(12.0*eps + beta1));
+	const REAL cand1 = (-stencil[3] + 5.0*stencil[2] + 2.0*stencil[1]);
+
+	const REAL result = (w0*cand0 + w1*cand1)/(6.0*(w0 + w1));
+	return result;
+}
+
+
+/*
+ * Two-candidate weighted reconstruction using stencil[1..4], with the index
+ * order reversed relative to OCFD_weno5_kernel_P_right_plus.
+ * Candidate 1 reads stencil[3],[2],[1] (weight numerator 6.0), candidate 2
+ * reads stencil[4],[3],[2] (weight numerator 1.0).
+ */
+__device__ REAL OCFD_weno5_kernel_M_lift_plus(REAL *stencil){
+
+	const REAL eps = 1e-6;
+	REAL diff;
+	REAL beta1 = 0.0, beta2 = 0.0;
+
+	// candidate on stencil[3],[2],[1]
+	diff = stencil[3] - 2.0*stencil[2] + stencil[1];
+	beta1 += 13*diff*diff;
+	diff =                  stencil[3] - stencil[1];
+	beta1 +=  3*diff*diff;
+
+	const REAL w1 = 6.0/((12.0*eps + beta1)*(12.0*eps + beta1));
+	const REAL cand1 = (-stencil[3] + 5.0*stencil[2] + 2.0*stencil[1]);
+
+	// candidate on stencil[4],[3],[2]
+	diff = stencil[4] - 2.0*stencil[3] +     stencil[2];
+	beta2 += 13*diff*diff;
+	diff = stencil[4] - 4.0*stencil[3] + 3.0*stencil[2];
+	beta2 +=  3*diff*diff;
+
+	const REAL w2 = 1.0/((12.0*eps + beta2)*(12.0*eps + beta2));
+	const REAL cand2 = (2.0*stencil[4] - 7.0*stencil[3] + 11.0*stencil[2]);
+
+	const REAL result = (w1*cand1 + w2*cand2)/(6.0*(w1 + w2));
+	return result;
+}
+//tmp = (2.0*stencil[-ka1+1] + 5.0*stencil[-ka1+2] + stencil[-ka1+3])/6.0;
+//tmp = (11.0*stencil[-ka1] - 7.0*stencil[-ka1-1] + 2.0*stencil[-ka1-2])/6.0;
+
+/*
+ * Near-boundary replacement for the "P" reconstruction.
+ *
+ * flagxyzb.y selects which test is made:
+ *   1/2/3 : coords.x / coords.y / coords.z  <= -ka1
+ *   4/5/6 : coords.x / coords.y / coords.z  >  (job.end - job.start) - kb1
+ *           along the same axis.
+ * When the test holds, *tmp is filled by one of the reduced-stencil formulas
+ * below (chosen by the exact value of the coordinate) and 0 is returned.
+ * Otherwise (test fails, or flagxyzb.y is not 1..6) *tmp is untouched and 1
+ * is returned.
+ *
+ * NOTE(review): inside a matching region, if none of the inner `if`s fires,
+ * 0 is still returned while *tmp keeps its previous value.
+ * NOTE(review): coords is a dim3 while ka1/kb1 are int, so the comparisons
+ * mix unsigned and signed operands — confirm the intended ranges with the
+ * callers. OCFD_weno5_kernel_P and minmod2 are defined outside this section.
+ */
+__device__ int OCFD_bound_scheme_kernel_p(REAL* tmp, dim3 flagxyzb, dim3 coords, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
+
+	switch(flagxyzb.y){
+		// lower-index side, x direction
+		case 1:
+		{
+			if(coords.x <= -ka1){
+
+				// first point: plain copy of a stencil value
+				if(coords.x == 0){
+					*tmp = stencil[-ka1+1];
+				}
+
+				if(coords.x == 1){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1+1] - stencil[-ka1]);
+					*tmp = OCFD_weno5_kernel_P_lift(&stencil[-ka1-2]);
+				}
+
+				if(coords.x == 2){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					*tmp = OCFD_weno5_kernel_P_lift_plus(&stencil[-ka1-2]);
+				}
+
+				if(coords.x == 3){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					*tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// lower-index side, y direction (same formulas as case 1)
+		case 2:
+		{
+			if(coords.y <= -ka1){
+
+				if(coords.y == 0){
+					*tmp = stencil[-ka1+1];
+				}
+
+				if(coords.y == 1){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1+1] - stencil[-ka1]);
+					*tmp = OCFD_weno5_kernel_P_lift(&stencil[-ka1-2]);
+				}
+
+				if(coords.y == 2){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					*tmp = OCFD_weno5_kernel_P_lift_plus(&stencil[-ka1-2]);
+				}
+
+				if(coords.y == 3){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					*tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// lower-index side, z direction (same formulas as case 1)
+		case 3:
+		{
+			if(coords.z <= -ka1){
+
+				if(coords.z == 0){
+					*tmp = stencil[-ka1+1];
+				}
+
+				if(coords.z == 1){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1+1] - stencil[-ka1]);
+					*tmp = OCFD_weno5_kernel_P_lift(&stencil[-ka1-2]);
+				}
+
+				if(coords.z == 2){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					*tmp = OCFD_weno5_kernel_P_lift_plus(&stencil[-ka1-2]);
+				}
+
+				if(coords.z == 3){
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					*tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// upper-index side, x direction
+		case 4:
+		{
+			if(coords.x > job.end.x-job.start.x-kb1){
+
+				// last point: minmod2 is called with the same difference twice
+				if(coords.x == job.end.x-job.start.x-1){
+					//*tmp = OCFD_weno5_kernel_P_right(&stencil[-ka1-2]);
+					*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.x == job.end.x-job.start.x-2){
+					*tmp = OCFD_weno5_kernel_P_right_plus(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.x <= job.end.x-job.start.x-3){
+					*tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// upper-index side, y direction (same formulas as case 4)
+		case 5:
+		{
+			if(coords.y > job.end.y-job.start.y-kb1){
+
+				if(coords.y == job.end.y-job.start.y-1){
+					//*tmp = OCFD_weno5_kernel_P_right(&stencil[-ka1-2]);
+					*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.y == job.end.y-job.start.y-2){
+					*tmp = OCFD_weno5_kernel_P_right_plus(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.y <= job.end.y-job.start.y-3){
+					*tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// upper-index side, z direction (same formulas as case 4)
+		case 6:
+		{
+			if(coords.z > job.end.z-job.start.z-kb1){
+
+				if(coords.z == job.end.z-job.start.z-1){
+					//*tmp = OCFD_weno5_kernel_P_right(&stencil[-ka1-2]);
+					*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.z == job.end.z-job.start.z-2){
+					*tmp = OCFD_weno5_kernel_P_right_plus(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.z <= job.end.z-job.start.z-3){
+					*tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+	}
+
+
+	// no boundary treatment applied; *tmp was not written
+	return 1;
+
+}
+
+
+
+/*
+ * Near-boundary replacement for the "M" reconstruction.
+ *
+ * flagxyzb.y selects which test is made:
+ *   1/2/3 : coords.x / coords.y / coords.z  <  -ka1
+ *   4/5/6 : coords.x / coords.y / coords.z  >= (job.end - job.start) - kb1 - 1
+ *           along the same axis.
+ * When the test holds, *tmp is filled by one of the reduced-stencil formulas
+ * below (chosen by the exact value of the coordinate) and 0 is returned.
+ * Otherwise (test fails, or flagxyzb.y is not 1..6) *tmp is untouched and 1
+ * is returned.
+ *
+ * NOTE(review): in cases 4-6, if the coordinate passes the outer test but
+ * matches none of the four inner equality checks, 0 is still returned while
+ * *tmp keeps its previous value.
+ * NOTE(review): coords is a dim3 while ka1/kb1 are int, so the comparisons
+ * mix unsigned and signed operands — confirm the intended ranges with the
+ * callers. OCFD_weno5_kernel_M and minmod2 are defined outside this section.
+ */
+__device__ int OCFD_bound_scheme_kernel_m(REAL* tmp, dim3 flagxyzb, dim3 coords, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
+
+	switch(flagxyzb.y){
+		// lower-index side, x direction
+		case 1:
+		{
+			if(coords.x < -ka1){
+
+				if(coords.x == 0){
+					*tmp = OCFD_weno5_kernel_M_lift(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1+1] - stencil[-ka1]);
+				}
+
+				if(coords.x == 1){
+					*tmp = OCFD_weno5_kernel_M_lift_plus(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.x >= 2){
+					*tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// lower-index side, y direction (same formulas as case 1)
+		case 2:
+		{
+			if(coords.y < -ka1){
+
+				if(coords.y == 0){
+					*tmp = OCFD_weno5_kernel_M_lift(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1+1] - stencil[-ka1]);
+				}
+
+				if(coords.y == 1){
+					*tmp = OCFD_weno5_kernel_M_lift_plus(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.y >= 2){
+					*tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// lower-index side, z direction (same formulas as case 1)
+		case 3:
+		{
+			if(coords.z < -ka1){
+
+				if(coords.z == 0){
+					*tmp = OCFD_weno5_kernel_M_lift(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1+1] - stencil[-ka1]);
+				}
+
+				if(coords.z == 1){
+					*tmp = OCFD_weno5_kernel_M_lift_plus(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				if(coords.z >= 2){
+					*tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
+					//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+				}
+
+				return 0;
+			}
+		}
+		break;
+
+		// upper-index side, x direction
+		case 4:
+		{
+			{
+				if(coords.x >= job.end.x-job.start.x-kb1-1){
+	
+					// last point: plain copy of a stencil value
+					if(coords.x == job.end.x-job.start.x-1){
+						*tmp = stencil[-ka1-1];
+					}
+	
+					// minmod2 is called with the same difference twice
+					if(coords.x == job.end.x-job.start.x-2){
+						//*tmp = OCFD_weno5_kernel_M_right(&stencil[-ka1-2]);
+						*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+					}
+	
+					if(coords.x == job.end.x-job.start.x-3){
+						*tmp = OCFD_weno5_kernel_M_right_plus(&stencil[-ka1-2]);
+						//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					}
+	
+					if(coords.x == job.end.x-job.start.x-4){
+						//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+						*tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
+					}
+	
+					return 0;
+				}
+			}
+		}
+		break;
+
+		// upper-index side, y direction (same formulas as case 4)
+		case 5:
+		{
+			{
+				if(coords.y >= job.end.y-job.start.y-kb1-1){
+	
+					if(coords.y == job.end.y-job.start.y-1){
+						*tmp = stencil[-ka1-1];
+					}
+	
+					if(coords.y == job.end.y-job.start.y-2){
+						//*tmp = OCFD_weno5_kernel_M_right(&stencil[-ka1-2]);
+						*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+					}
+	
+					if(coords.y == job.end.y-job.start.y-3){
+						*tmp = OCFD_weno5_kernel_M_right_plus(&stencil[-ka1-2]);
+						//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					}
+	
+					if(coords.y == job.end.y-job.start.y-4){
+						//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+						*tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
+					}
+	
+					return 0;
+				}
+			}
+		}
+		break;
+
+		// upper-index side, z direction (same formulas as case 4)
+		case 6:
+		{
+			{
+				if(coords.z >= job.end.z-job.start.z-kb1-1){
+	
+					if(coords.z == job.end.z-job.start.z-1){
+						*tmp = stencil[-ka1-1];
+					}
+	
+					if(coords.z == job.end.z-job.start.z-2){
+						//*tmp = OCFD_weno5_kernel_M_right(&stencil[-ka1-2]);
+						*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
+					}
+	
+					if(coords.z == job.end.z-job.start.z-3){
+						*tmp = OCFD_weno5_kernel_M_right_plus(&stencil[-ka1-2]);
+						//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+					}
+	
+					if(coords.z == job.end.z-job.start.z-4){
+						//*tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
+						*tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
+					}
+	
+					return 0;
+				}
+			}
+		}
+		break;
+	}
+
+	// no boundary treatment applied; *tmp was not written
+	return 1;
+
+}
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_boundary.cu
+++ b/src/OCFD_boundary.cu
+//-------Boundary condition --------------------------------------------------------
+#include "stdlib.h"
+#include "stdio.h"
+#include "parameters.h"
+#include "utility.h"
+#include "OCFD_boundary_Liftbody3D.h"
+#include "OCFD_boundary_compression_conner.h"
+
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+#include "commen_kernel.h"
+#include "parameters_d.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * Boundary-condition driver.
+ * First dispatches to the user boundary routine selected by IBC_USER
+ * (124: lift body, simple variant while Init_stat == 0; 108: compression
+ * corner; anything else: nothing). Then, for every side on which this
+ * process holds the first or last index (npx/npy/npz equal to 0 or to
+ * NP?0 - 1), launches pri_to_cons_kernel over the one-cell-thick face.
+ */
+void OCFD_bc()
+{
+	// ---- user boundary condition ----
+	if(IBC_USER == 124){
+		if(Init_stat != 0){
+			bc_user_Liftbody3d();
+		}else{
+			bc_user_Liftbody3d_simple();
+		}
+	}else if(IBC_USER == 108){
+		bc_user_Compression_conner();
+	}
+
+	// ---- x faces ----
+	if (npx == 0)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , 1 , ny , nz);
+
+		cudaJobPackage face( dim3(0 , 0 , 0) , dim3(1 , ny , nz) );
+		pri_to_cons_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face);
+	}
+	if (npx == NPX0 - 1)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , 1 , ny , nz);
+
+		cudaJobPackage face( dim3(nx-1 , 0 , 0) , dim3(nx , ny , nz) );
+		pri_to_cons_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face);
+	}
+
+	// ---- y faces ----
+	if (npy == 0)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , 1 , nz);
+
+		cudaJobPackage face( dim3(0 , 0 , 0) , dim3(nx , 1 , nz) );
+		pri_to_cons_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face);
+	}
+	if (npy == NPY0 - 1)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , 1 , nz);
+
+		cudaJobPackage face( dim3(0 , ny-1 , 0) , dim3(nx , ny , nz) );
+		pri_to_cons_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face);
+	}
+
+	// ---- z faces ----
+	if (npz == 0)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , ny , 1);
+
+		cudaJobPackage face( dim3(0 , 0 , 0) , dim3(nx , ny , 1) );
+		pri_to_cons_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face);
+	}
+	if (npz == NPZ0 - 1)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , ny , 1);
+
+		cudaJobPackage face( dim3(0 , 0 , nz-1) , dim3(nx , ny , nz) );
+		pri_to_cons_kernel<<<grid , block>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face);
+	}
+}
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/OCFD_boundary_Liftbody3D.cu
+++ b/src/OCFD_boundary_Liftbody3D.cu
+// Boundary condition for flow over a  3D Liftbody -------------------------
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "parameters.h"
+#include "parameters_d.h"
+#include "utility.h"
+#include "io_warp.h"
+#include "cuda_commen.h"
+#include "cuda_utility.h"
+
+//#include "OCFD_boundary_init.h"
+#include "OCFD_init.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+extern cudaField *pu2d_inlet_d; //[5][nz][ny]
+extern cudaField *pu2d_upper_d; //[5][ny][nx]
+//extern cudaField *pv_dist_wall_d; // [ny][nx]
+extern cudaField *pv_dist_coeff_d; // [3][ny][nx]
+extern cudaField *pu_dist_upper_d; // [ny][nx]
+
+extern const char v_dist_need;
+extern const char TW_postive;
+
+extern REAL *fait;
+extern REAL *TM;
+
+/*
+ * Copies the five inlet planes stored in `inlet` (read at plane offsets
+ * 0..4 times nz_d) into d, u, v, w, T for every x index 0..LAP.
+ * One thread handles one (y,z) pair inside [job.start, job.end); the y/z
+ * indices are the ones used with get_Field_LAP, and LAP is subtracted from
+ * them before indexing `inlet`.
+ */
+__global__ void do_u2d_inlet_kernel(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField inlet , cudaJobPackage job){
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	const unsigned int k = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// threads outside the job range have nothing to do
+	if(j >= job.end.y || k >= job.end.z) return;
+
+	const unsigned int jin = j-LAP;
+	const unsigned int kin = k-LAP;
+
+	int i = 0;
+	while(i <= LAP){
+		get_Field_LAP(d, i, j, k) = *(inlet.ptr + jin + inlet.pitch *  kin);
+		get_Field_LAP(u, i, j, k) = *(inlet.ptr + jin + inlet.pitch *( kin + 1*nz_d) );
+		get_Field_LAP(v, i, j, k) = *(inlet.ptr + jin + inlet.pitch *( kin + 2*nz_d) );
+		get_Field_LAP(w, i, j, k) = *(inlet.ptr + jin + inlet.pitch *( kin + 3*nz_d) );
+		get_Field_LAP(T, i, j, k) = *(inlet.ptr + jin + inlet.pitch *( kin + 4*nz_d) );
+		i++;
+	}
+}
+
+/* ================================= */
+
+/*
+ * Writes the plane z = nz_lap_d - 1 of d, u, v, w, T from the five planes
+ * stored in `upper` (read at plane offsets 0..4 times ny_d); u additionally
+ * receives the value read from `dist`.
+ * One thread handles one (x,y) pair inside [job.start, job.end).
+ */
+__global__ void do_u2d_upper_kernel(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField upper , cudaField dist , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y) return;
+
+	const unsigned int iin = i-LAP;
+	const unsigned int jin = j-LAP;
+	const unsigned int ktop = nz_lap_d - 1;
+
+	get_Field_LAP(d , i , j , ktop) = *(upper.ptr + iin + upper.pitch *  jin );
+	get_Field_LAP(u , i , j , ktop) = *(upper.ptr + iin + upper.pitch *( jin + 1*ny_d) ) + *(dist.ptr + iin + dist.pitch * jin );
+	get_Field_LAP(v , i , j , ktop) = *(upper.ptr + iin + upper.pitch *( jin + 2*ny_d) );
+	get_Field_LAP(w , i , j , ktop) = *(upper.ptr + iin + upper.pitch *( jin + 3*ny_d) );
+	get_Field_LAP(T , i , j , ktop) = *(upper.ptr + iin + upper.pitch *( jin + 4*ny_d) );
+}
+
+/*
+ * Writes the plane z = nz_lap_d - 1 with constant values: d = 1, v = 0,
+ * w = sin_aoa, T = 1, and u = cos_aoa plus the value read from `dist`.
+ * One thread handles one (x,y) pair inside [job.start, job.end).
+ */
+__global__ void do_u_dist_upper_kernel(REAL sin_aoa , REAL cos_aoa , cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField dist , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y) return;
+
+	const unsigned int iin = i-LAP;
+	const unsigned int jin = j-LAP;
+	const unsigned int ktop = nz_lap_d - 1;
+
+	get_Field_LAP(d , i , j , ktop) = 1.0;
+	get_Field_LAP(u , i , j , ktop) =  cos_aoa + *(dist.ptr + iin + dist.pitch * jin );
+	get_Field_LAP(v , i , j , ktop) = 0.0;
+	get_Field_LAP(w , i , j , ktop) = sin_aoa;
+	get_Field_LAP(T , i , j , ktop) = 1.0;
+}
+
+/* ============================================= */
+/*
+ * Mirror fill in y about index LAP: each point (x,y,z) in the job range
+ * receives the value at (x, 2*LAP - y, z). d, u, w, T are copied; v is
+ * copied with its sign flipped.
+ */
+__global__ void do_symmetry_kernel_m(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	const unsigned int k = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;
+
+	// mirrored y index
+	const unsigned int jm = 2*LAP - j;
+
+	get_Field_LAP(d, i, j, k) = get_Field_LAP(d, i, jm, k);
+	get_Field_LAP(u, i, j, k) = get_Field_LAP(u, i, jm, k);
+	get_Field_LAP(v, i, j, k) = -1.0*get_Field_LAP(v, i, jm, k);
+	get_Field_LAP(w, i, j, k) = get_Field_LAP(w, i, jm, k);
+	get_Field_LAP(T, i, j, k) = get_Field_LAP(T, i, jm, k);
+}
+
+/*
+ * Mirror fill in y about index ny_lap_d - 1: each point (x,y,z) in the job
+ * range receives the value at (x, 2*(ny_lap_d - 1) - y, z). d, u, w, T are
+ * copied; v is copied with its sign flipped.
+ */
+__global__ void do_symmetry_kernel_p(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	const unsigned int k = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;
+
+	// mirrored y index
+	const unsigned int jm = 2*(ny_lap_d - 1) - j;
+
+	get_Field_LAP(d, i, j, k) = get_Field_LAP(d, i, jm, k);
+	get_Field_LAP(u, i, j, k) = get_Field_LAP(u, i, jm, k);
+	get_Field_LAP(v, i, j, k) = -1.0*get_Field_LAP(v, i, jm, k);
+	get_Field_LAP(w, i, j, k) = get_Field_LAP(w, i, jm, k);
+	get_Field_LAP(T, i, j, k) = get_Field_LAP(T, i, jm, k);
+}
+
+/* =============================================== */
+/*
+ * Wall plane z = LAP with prescribed temperature and prescribed velocity.
+ * u, v, w are the three planes of `coeff` (plane offsets 0..2 times ny_d)
+ * multiplied by HT; T is set to tw; d is computed from the d*T products at
+ * z = LAP+1 and z = LAP+2 as (4*dT1 - dT2)/(3*tw).
+ * One thread handles one (x,y) pair inside [job.start, job.end).
+ */
+__global__ void do_wall_kernel_T_V(REAL tw , cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , REAL HT , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y) return;
+
+	const unsigned int iin = i-LAP;
+	const unsigned int jin = j-LAP;
+
+	// wall velocity
+	get_Field_LAP(u , i , j , LAP) = *(coeff.ptr + iin + coeff.pitch *  jin ) * HT;
+	get_Field_LAP(v , i , j , LAP) = *(coeff.ptr + iin + coeff.pitch *( jin + 1*ny_d) ) * HT;
+	get_Field_LAP(w , i , j , LAP) = *(coeff.ptr + iin + coeff.pitch *( jin + 2*ny_d) ) * HT;
+
+	// wall temperature and density
+	get_Field_LAP(T , i , j , LAP) = tw;
+	get_Field_LAP(d , i , j , LAP) = (4.0 * get_Field_LAP(d , i,j,LAP+1) * get_Field_LAP(T , i , j , LAP+1) -  get_Field_LAP(d , i,j,LAP+2) * get_Field_LAP(T , i , j , LAP+2))/(3.0*tw);
+}
+/*
+ * Wall plane z = LAP with prescribed velocity and no prescribed
+ * temperature. u, v, w are the three planes of `coeff` (plane offsets 0..2
+ * times ny_d) multiplied by HT; T and d are each set to
+ * (4*f(LAP+1) - f(LAP+2))/3 from their own values at the next two planes.
+ * One thread handles one (x,y) pair inside [job.start, job.end).
+ */
+__global__ void do_wall_kernel_NT_V(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , REAL HT , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y) return;
+
+	const unsigned int iin = i-LAP;
+	const unsigned int jin = j-LAP;
+
+	// wall velocity
+	get_Field_LAP(u , i , j , LAP) = *(coeff.ptr + iin + coeff.pitch *  jin ) * HT;
+	get_Field_LAP(v , i , j , LAP) = *(coeff.ptr + iin + coeff.pitch *( jin + 1*ny_d) ) * HT;
+	get_Field_LAP(w , i , j , LAP) = *(coeff.ptr + iin + coeff.pitch *( jin + 2*ny_d) ) * HT;
+
+	// wall temperature and density from the two planes above
+	get_Field_LAP(T , i , j , LAP) = (4.0 * get_Field_LAP(T , i , j , LAP+1) - get_Field_LAP(T , i , j , LAP+2))/3.0;
+	get_Field_LAP(d , i , j , LAP) = (4.0 * get_Field_LAP(d , i , j , LAP+1) - get_Field_LAP(d , i , j , LAP+2))/3.0;
+}
+/*
+ * Wall plane z = LAP with prescribed temperature and zero velocity.
+ * u, v, w are set to 0; T is set to tw; d is computed from the d*T products
+ * at z = LAP+1 and z = LAP+2 as (4*dT1 - dT2)/(3*tw).
+ * The `coeff` argument is not read by this kernel.
+ * One thread handles one (x,y) pair inside [job.start, job.end).
+ */
+__global__ void do_wall_kernel_T_NV(REAL tw , cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y) return;
+
+	// wall velocity
+	get_Field_LAP(u , i , j , LAP) = 0.0;
+	get_Field_LAP(v , i , j , LAP) = 0.0;
+	get_Field_LAP(w , i , j , LAP) = 0.0;
+
+	// wall temperature and density
+	get_Field_LAP(T , i , j , LAP) = tw;
+	get_Field_LAP(d , i , j , LAP) = (4.0 * get_Field_LAP(d , i,j,LAP+1) * get_Field_LAP(T , i , j , LAP+1) -  get_Field_LAP(d , i,j,LAP+2) * get_Field_LAP(T , i , j , LAP+2))/(3.0*tw);
+}
+
+/*
+ * Wall plane z = LAP with zero velocity and no prescribed temperature.
+ * u, v, w are set to 0; T and d are each set to
+ * (4*f(LAP+1) - f(LAP+2))/3 from their own values at the next two planes.
+ * The `coeff` argument is not read by this kernel.
+ * One thread handles one (x,y) pair inside [job.start, job.end).
+ */
+__global__ void do_wall_kernel_NT_NV(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y) return;
+
+	// wall velocity
+	get_Field_LAP(u , i , j , LAP) = 0.0;
+	get_Field_LAP(v , i , j , LAP) = 0.0;
+	get_Field_LAP(w , i , j , LAP) = 0.0;
+
+	// wall temperature and density from the two planes above
+	get_Field_LAP(T , i , j , LAP) = (4.0 * get_Field_LAP(T , i , j , LAP+1) - get_Field_LAP(T , i , j , LAP+2))/3.0;
+	get_Field_LAP(d , i , j , LAP) = (4.0 * get_Field_LAP(d , i , j , LAP+1) - get_Field_LAP(d , i , j , LAP+2))/3.0;
+}
+
+/*  ---------------------------------------------- */
+/* ======================================== */
+/*
+ * Boundary conditions for the lift-body case. Depending on which sides this
+ * process holds (npx/npy/npz), launches:
+ *   - the inlet kernel            (npx == 0; aborts if IF_WITHLEADING == 1),
+ *   - one of two upper kernels    (npz == NPZ0-1, by IFLAG_UPPERBOUNDARY),
+ *   - one of four wall kernels    (npz == 0, by v_dist_need / TW_postive),
+ *   - the two symmetry kernels    (npy == 0 / NPY0-1 when IF_SYMMETRY == 1).
+ */
+void bc_user_Liftbody3d(){
+	//-------------- inlet (npx == 0) --------------
+	if (npx == 0)
+	{
+		if (IF_WITHLEADING == 1)
+		{
+			printf(" Lift body with leading is not support yet \n");
+			exit(EXIT_FAILURE);
+		}
+
+		// without leading
+		dim3 block , grid;
+		cal_grid_block_dim(&grid , &block , 1 , BlockDimY , BlockDimZ , 1 , ny , nz);
+
+		cudaJobPackage inlet_job( dim3(LAP,LAP,LAP) , dim3(LAP+1, ny_lap ,nz_lap) );
+
+		CUDA_LAUNCH(( do_u2d_inlet_kernel<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pu2d_inlet_d , inlet_job) ));
+	}
+
+	//-------------- upper boundary (npz == NPZ0 - 1) --------------
+	if (npz == NPZ0 - 1 && (IFLAG_UPPERBOUNDARY == 0 || IFLAG_UPPERBOUNDARY == 1))
+	{
+		dim3 block , grid;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
+
+		cudaJobPackage upper_job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap ,LAP+1) );
+
+		if (IFLAG_UPPERBOUNDARY == 0)
+		{ // Out of blow shock
+			CUDA_LAUNCH(( do_u_dist_upper_kernel<<<grid , block>>>( Sin_AOA , Cos_AOA ,*pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pu_dist_upper_d, upper_job) ));
+		}
+		else
+		{ // In the blow shock
+			CUDA_LAUNCH(( do_u2d_upper_kernel<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pu2d_upper_d , *pu_dist_upper_d, upper_job) ));
+		}
+	}
+
+	//-------------- wall (npz == 0) --------------
+
+	// time factor: 1 unless BETA > 0, in which case it is a sum of MTMAX sine modes
+	REAL ht = 1.;
+
+	if(BETA > 0.){
+		ht = 0.;
+		for(int m = 0; m < MTMAX; m++){
+			//ht = ht + TM[m] * sin((m + 1)*BETA*tt + 2.*PI*fait[m]);
+			ht += TM[m] * sin((m + 1)*BETA*tt);
+		}
+	}
+
+	if (npz == 0)
+	{
+		// every wall variant uses the same launch configuration and job
+		dim3 block , grid;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
+
+		cudaJobPackage wall_job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap ,LAP+1) );
+
+		if(v_dist_need && TW_postive){
+			CUDA_LAUNCH(( do_wall_kernel_T_V<<<grid , block>>>( TW , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d , ht , wall_job) ));
+		}else if(v_dist_need){
+			CUDA_LAUNCH(( do_wall_kernel_NT_V<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d, ht , wall_job) ));
+		}else if(TW_postive){
+			CUDA_LAUNCH(( do_wall_kernel_T_NV<<<grid , block>>>(TW , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d , wall_job) ));
+		}else{
+			CUDA_LAUNCH(( do_wall_kernel_NT_NV<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d , wall_job) ));
+		}
+	}
+
+	//-------------- symmetry in y --------------
+	if (npy == 0 && IF_SYMMETRY == 1)
+	{
+		dim3 block , grid;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
+
+		cudaJobPackage sym_job( dim3(LAP,0,LAP) , dim3(nx_lap, LAP ,nz_lap) );
+
+		CUDA_LAUNCH(( do_symmetry_kernel_m<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_job) ));
+	}
+
+	if (npy == NPY0 - 1 && IF_SYMMETRY == 1)
+	{
+		dim3 block , grid;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
+
+		cudaJobPackage sym_job( dim3(LAP,ny_lap,LAP) , dim3(nx_lap, ny_2lap ,nz_lap) );
+
+		CUDA_LAUNCH(( do_symmetry_kernel_p<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_job) ));
+	}
+}
+
+
+
+/* =============================================================================== */
+
+/*
+ * Fills every point of the job range with constant values:
+ * d = 1, u = 1, v = 0, w = 0, T = 1.
+ * One thread handles one (x,y,z) point inside [job.start, job.end).
+ */
+__global__ void simple_boundary_condition(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	const unsigned int k = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;
+
+	get_Field_LAP(d , i , j , k) = 1.0;
+	get_Field_LAP(u , i , j , k) = 1.0;
+	get_Field_LAP(v , i , j , k) = 0.0;
+	get_Field_LAP(w , i , j , k) = 0.0;
+	get_Field_LAP(T , i , j , k) = 1.0;
+}
+
+/*
+ * Copies d, u, v, w, T at every point of the job range from the neighbour
+ * at x-1 (same y and z).
+ * One thread handles one (x,y,z) point inside [job.start, job.end).
+ */
+__global__ void out_boundary_condition(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
+	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
+	const unsigned int j = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
+	const unsigned int k = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
+
+	// threads outside the job range have nothing to do
+	if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;
+
+	get_Field_LAP(d , i , j , k) = get_Field_LAP(d , i-1 , j , k);
+	get_Field_LAP(u , i , j , k) = get_Field_LAP(u , i-1 , j , k);
+	get_Field_LAP(v , i , j , k) = get_Field_LAP(v , i-1 , j , k);
+	get_Field_LAP(w , i , j , k) = get_Field_LAP(w , i-1 , j , k);
+	get_Field_LAP(T , i , j , k) = get_Field_LAP(T , i-1 , j , k);
+}
+
+/*
+ * Simplified boundary-condition driver for the Liftbody3d case.
+ *
+ * Depending on npx / npy / npz (presumably this process's position in the
+ * NPX0 x NPY0 x NPZ0 process grid -- confirm against their definition) it
+ * launches, in this order:
+ *   1. npx == 0            : simple_boundary_condition on the x plane LAP
+ *   2. npx == NPX0 - 1     : out_boundary_condition on the x plane nx_lap
+ *   3. npz == NPZ0 - 1     : simple_boundary_condition on the z plane nz_lap-1
+ *   4. npz == 0            : simple_boundary_condition on the z plane LAP
+ *   5. IF_SYMMETRY == 1 and npy == 0        : do_symmetry_kernel_m on y in [0, LAP)
+ *   6. IF_SYMMETRY == 1 and npy == NPY0 - 1 : do_symmetry_kernel_p on y in [ny_lap, ny_2lap)
+ * All kernels operate on the fields *pd_d, *pu_d, *pv_d, *pw_d, *pT_d.
+ */
+void bc_user_Liftbody3d_simple(){
+	//-------------- boundary at i=1 (inlet): fixed values on one x plane --------------
+	if (npx == 0)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , 1 , BlockDimY , BlockDimZ , 1 , ny , nz);
+
+		cudaJobPackage inlet_job( dim3(LAP,LAP,LAP) , dim3(LAP+1, ny_lap ,nz_lap) );
+
+		CUDA_LAUNCH(( simple_boundary_condition<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , inlet_job) ));
+	}
+
+	//-------------- boundary at the last x process: copy from the x-1 neighbour --------------
+	if (npx == NPX0 - 1)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , 1 , BlockDimY , BlockDimZ , 1 , ny , nz);
+
+		cudaJobPackage outlet_job( dim3(nx_lap,LAP,LAP) , dim3(nx_lap+1, ny_lap ,nz_lap) );
+
+		CUDA_LAUNCH(( out_boundary_condition<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , outlet_job) ));
+	}
+
+	//-------------- boundary at k=nz (upper): fixed values on one z plane --------------
+	if (npz == NPZ0 - 1)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
+
+		cudaJobPackage upper_job( dim3(LAP,LAP,nz_lap - 1) , dim3(nx_lap, ny_lap ,nz_lap) );
+
+		CUDA_LAUNCH(( simple_boundary_condition<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , upper_job) ));
+	}
+
+	//-------------- wall (k=1): this simplified variant writes the same fixed values --------------
+	if (npz == 0)
+	{
+		dim3 grid , block;
+		cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
+
+		cudaJobPackage wall_job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap ,LAP+1) );
+
+		CUDA_LAUNCH(( simple_boundary_condition<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , wall_job) ));
+	}
+
+	//-------------- symmetry planes in y (only when IF_SYMMETRY == 1) --------------
+	if (IF_SYMMETRY == 1)
+	{
+		// Low-y side: LAP planes starting at y = 0.
+		if (npy == 0)
+		{
+			dim3 grid , block;
+			cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
+
+			cudaJobPackage sym_minus_job( dim3(LAP,0,LAP) , dim3(nx_lap, LAP ,nz_lap) );
+
+			CUDA_LAUNCH(( do_symmetry_kernel_m<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_minus_job) ));
+		}
+
+		// High-y side: planes from y = ny_lap up to (not including) ny_2lap.
+		if (npy == NPY0 - 1)
+		{
+			dim3 grid , block;
+			cal_grid_block_dim(&grid , &block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
+
+			cudaJobPackage sym_plus_job( dim3(LAP,ny_lap,LAP) , dim3(nx_lap, ny_2lap ,nz_lap) );
+
+			CUDA_LAUNCH(( do_symmetry_kernel_p<<<grid , block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_plus_job) ));
+		}
+	}
+}
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file