Commit c0b0318b authored by ccfd's avatar ccfd
Browse files

first commit

parents
/*
 * test.h -- prototypes of the write_block_me* helpers.
 *
 * All four functions share the same signature: a name string, a pointer to
 * REAL data and three integer extents.  Their definitions are not part of
 * this header.
 * NOTE(review): presumably each one dumps an nx*ny*nz block of `u` to a file
 * called `name`, the numeric suffix selecting a variant -- confirm against
 * the implementation.
 */
#ifndef __TEST_H
#define __TEST_H
#include <stdlib.h>
#include <stdio.h>
#include "parameters.h"
void write_block_me(char * name , REAL * u , int nx , int ny , int nz);
void write_block_me1(char * name , REAL * u , int nx , int ny , int nz);
void write_block_me2(char * name , REAL * u , int nx , int ny , int nz);
void write_block_me3(char * name , REAL * u , int nx , int ny , int nz);
#endif
/*
 * utility.h -- small helper macros and the tracked-allocation entry points.
 *
 * Every macro argument and every macro result is parenthesised so that the
 * macros expand correctly when they are given compound expressions
 * (e.g. MAX(a & m, b), 2 * MIN(x, y), PROCIdx2Num(i + 1, j, k)).
 */
#ifndef __UTILITY_H
#define __UTILITY_H
#include <stdlib.h>
#include <stdio.h>
#include "parameters.h"
#include "config_parameters.h"

/* Cast a pointer to "pointer to REAL[ny][nx]" / "pointer to REAL[nz][ny][nx]". */
#define PTR2ARRAY2(ptr,nx,ny) ((REAL(*)[(ny)][(nx)])(ptr))
#define PTR2ARRAY3(ptr,nx,ny,nz) ((REAL(*)[(nz)][(ny)][(nx)])(ptr))

/* Larger / smaller of two values.  Each argument may be evaluated twice, so
 * do not pass expressions with side effects. */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define MIN(a,b) ((a) < (b) ? (a) : (b))

/* Linear process number of the process at (proc_i, proc_j, proc_k); proc_i
 * varies fastest, then proc_j (stride NPX0), then proc_k (stride NPX0*NPY0). */
#define PROCIdx2Num(proc_i , proc_j , proc_k) ((proc_i) + (proc_j)*NPX0 + (proc_k)*NPX0*NPY0)

#ifdef __cplusplus
extern "C"{
#endif

/* malloc_me_Host(p, size): forwards the address of the pointer variable `p`
 * and `size` to malloc_me_Host_, together with the caller's function name,
 * file and line. */
#define malloc_me_Host(p, size) malloc_me_Host_((void**)&(p) , (size) , __FUNCTION__ , __FILE__ ,__LINE__)
void malloc_me_Host_(void **p, int size , const char * funname , const char * file , int line);

/* malloc_me(size): forwards `size` to malloc_me_, together with the caller's
 * function name, file and line, and yields the void* it returns. */
#define malloc_me(size) malloc_me_((size) , __FUNCTION__ , __FILE__ ,__LINE__)
void * malloc_me_(int size , const char * funname , const char * file , int line);

#ifdef __cplusplus
}
#endif
#endif
\ No newline at end of file
#nonw
# Name of the build machine; only printed by the `echo` target.
HOST_NAME=$(shell hostname)
# Source list: every .c and .cu file under src/.
SRC=$(wildcard src/*.c)
SRC+=$(wildcard src/*.cu)
# Header list: every .h file under head/.
HEAD=$(wildcard head/*.h)
# Object list: src/NAME.c and src/NAME.cu both map to obj/NAME.o.
# The patterns contain no '.', so '%' matches "NAME." and the suffix is the
# bare "c" / "cu"; the first pass rewrites the .c entries, the second pass
# rewrites the .cu entries that the first pass left unchanged.
OBJ=$(patsubst src/%c,obj/%o, $(SRC) )
# ':=' expands OBJ (and therefore SRC) right here, at this point of the file.
OBJ:=$(patsubst src/%cu,obj/%o, $(OBJ) )
# First rule in this makefile; it simply depends on `default`.
all: default
ifneq ($(shell which hipcc),)
# HIP compiler
#ifndef MPICH
#$(error env MPICH doesn't exist , MPI_PATH has wrong value)
#endif
#ifndef HIPCC
#$(error env HIP doesn't exist , DEV_PATH has wrong value)
#endif
#MPI_PATH=/opt/hpc/software/mpi/hpcx/v2.4.1/
MPI_PATH=/opt/hpc/software/mpi/hpcx/v2.7.4/gcc-7.3.1/
DEV_PATH=/opt/rocm/hip/
DEV=hipcc
HOST=mpicxx
OPT_Commen=-O3
OPT_Host=-c -std=c99 -I $(DEV_PATH)/include -I $(DEV_PATH)/include/hip/hcc_detail/cuda -D __HIP_PLATFORM_HCC__ -D __HIPCC__
OPT_Host+= $(OPT_Commen)
OPT_Dev=-c -I /usr/include/x86_64-linux-gnu/mpich
OPT_Dev+=$(OPT_Commen)
SRC:=$(patsubst src/%.c,src_hip/%.c, $(SRC))
SRC:=$(patsubst src/%.cu,src_hip/%.cpp, $(SRC))
HEAD:=$(patsubst head/%.h,head_hip/%.h, $(HEAD))
.PRECIOUS : %.o %_hip.c %_hip.cpp %_hip.h
opencfd_hip.c : opencfd.c
hipify-perl $< > $@
opencfd.o : opencfd_hip.c $(HEAD)
$(HOST) $(OPT_Host) -I head_hip/ -o opencfd.o opencfd_hip.c
hip_file : $(HEAD) $(SRC)
head_hip/%.h : head/%.h
@if [ ! -e "head_hip" ] ; then mkdir head_hip ; fi
hipify-perl $< > $@
src_hip/%.c : src/%.c
@if [ ! -e "src_hip" ] ; then mkdir src_hip ; fi
hipify-perl $< > $@
src_hip/%.cpp : src/%.cu
@if [ ! -e "src_hip" ] ; then mkdir src_hip ; fi
hipify-perl $< > $@
src_hip/%.cpp : ana/%.c
@if [ ! -e "src_hip" ] ; then mkdir src_hip ; fi
hipify-perl $< > $@
src_hip/%.cpp : ana/%.cu
@if [ ! -e "src_hip" ] ; then mkdir src_hip ; fi
hipify-perl $< > $@
obj/%.o : src_hip/%.c head_hip/%.h
@if [ ! -e "obj" ] ; then mkdir obj ; fi
$(HOST) $(OPT_Host) -I head_hip/ $< -o $@
obj/%.o : src_hip/%.cpp head_hip/%.h
@if [ ! -e "obj" ] ; then mkdir obj ; fi
$(DEV) $(OPT_Dev) -I head_hip/ $< -o $@
clean:
rm -f *.o *.out obj/*.o obj/*.a src_hip/* head_hip/* opencfd_hip.c
else
#nvcc compiler
MPI_PATH=/usr/
#MPI_PATH=/home/dglin/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/
DEV_PATH=/usr/local/cuda-11.6/
#ifndef MPICH
#$(error env MPICH doesn't exist , MPI_PATH has wrong value)
#endif
#ifndef CUDA
#$(error env CUDA doesn't exist , DEV_PATH has wrong value)
#endif
#
#
#MPI_PATH=$(MPICH)
#DEV_PATH=$(CUDA)
DEV=nvcc
HOST=$(MPI_PATH)/bin/mpicc
OPT_Commen=-g
OPT_Host=-c -std=c99 -I $(DEV_PATH)/include
OPT_Host+= $(OPT_Commen)
OPT_Dev=-dc -I /usr/include/x86_64-linux-gnu/mpich
OPT_Dev+=$(OPT_Commen) -G -code=sm_75 -arch=compute_75
opencfd.o : opencfd.c
$(HOST) $(OPT_Host) -I head/ -o opencfd.o opencfd.c
obj/%.o : src/%.c head/%.h
@if [ ! -e "obj" ] ; then mkdir obj ; fi
$(HOST) $(OPT_Host) -I head/ $< -o $@
obj/%.o : src/%.cu head/%.h
@if [ ! -e "obj" ] ; then mkdir obj ; fi
$(DEV) $(OPT_Dev) -I head/ $< -o $@
clean:
rm -f *.o *.out obj/*.o obj/*.a
endif
# Final link: the driver object plus the static library obj/libocfd.a are
# linked with $(DEV) against MPI, libm and pthread into opencfd-scu.out.
default : opencfd.o obj/libocfd.a
$(DEV) -O3 -o opencfd-scu.out opencfd.o -L obj -locfd -L $(MPI_PATH)/lib -lmpi -lm -lpthread
# Static library built from every object listed in $(OBJ).
obj/libocfd.a : $(OBJ)
ar -crv obj/libocfd.a $(OBJ)
# NOTE(review): ZIP_EXIST is not referenced anywhere in the visible part of this makefile.
ZIP_EXIST=0
# Source archive: removes an existing src_cuda.zip, then zips head/, src/,
# test/, opencfd.c, makefile and README into a new one.
zip :
@if [ -e "src_cuda.zip" ] ; then rm src_cuda.zip ; echo "rm src_cuda.zip"; fi
zip --quiet -r src_cuda.zip head/ src/ test/ opencfd.c makefile README
# Debug aid: prints the host name and the SRC / OBJ / HEAD lists.
# NOTE(review): $(A) is not defined in the visible part of this makefile.
echo:
@echo $(HOST_NAME)
@echo $(SRC)
@echo $(OBJ)
@echo $(HEAD)
@echo $(A)
#OpenCFD-SCU-Ver2.00 input file, Dglin, 2021-05
GRID_3D = 25 240 20
PARALLEL_3D = 1 1 1
STREAM = 1
CHARTERIC = 1
TEST = 0
IPERIODIC = 0 0 1
JAC_BOUND = 1 1 1
DIF_BOUND = 1 1 1 1 0 0
NON_REFLETION = 0 1 0 1 0 0
SCHEME_INVIS = WENO7_SYMBO
SCHEME_VIS = CD8
#SCHEME_INVIS = SCHEME_HYBRIDAUTO
#HY_DP_INTV = 1.0 8.0
#HY_STYLE = 1
#HY_SMOOTH_DP = 1
#HY_PATCH_ZONE = 1
#HY_ZONE0 = 10 25 100 240 5 20 20.0
RE = 5581.4
AMA = 2.9
GAMMA = 1.40
PR = 0.70
T_REF = 108.1
EPSL_SW = 0.0
DT = 0.01
END_TIME = 2000
KSTEP_SHOW = 1
KSTEP_SAVE = 10000
INIT_STAT = 1
IBC = 108
#mzmax, mtmax, Inlet_boundary, If_wall_not_normal
BC_NPARA = 10 5 1 0
#Tw, epsl, x_dis_begin, x_dis_end, beta, x_wall_begin, x_up_bound_begin, SLZ
BC_RPARA = 2.84 0.2 -320. -300 0.1 -400. -50. 14.
#nstep_filter, Filter_X, Filter_Y, Filter_Z, ib, ie, jb, je, kb, ke, Filter_scheme // s0, rth, Filter_end_time
#FILTER_NPARA0 = 100 1 1 1 0 25 0 240 0 20 2
#FILTER_RPARA0 = 1.0 1.e-5 1000000
#ANA_EVENT0 = 100 10
#ANA_NPARA0 = 0
#ANA_RPARA0 = 0
#*****************************************************************************************************
# IBC nk nr IF_symmetry IF_withleading Iflag_upperboundary AoA Tw epsl_wall epsl_upper wall_dis_begin wall_dis_end //(liftbody)
124 3 6 1 0 1 0. 3.797 0.02 0. 50. 60.
//----------------------------------------------------------------------------------------------------------------------------------------
// OpenCFD-SC , 3-D compressible Navier-Stokes Finite difference Solver
// Copyright by LI Xinliang, LHD, Institute of Mechanics, CAS, Email: lixl@imech.ac.cn
//
// The default code is double precision computation
// If you want to use SINGLE PRECISION computation, you can change "OCFD_REAL_KIND=8" to "OCFD_REAL_KIND=4" ,
// and "OCFD_DATA_TYPE=MPI_DOUBLE_PRECISION" to "OCFD_DATA_TYPE=MPI_REAL" in the file OpenCFD.h
//----------------------------------------------------------------------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include "mpi.h"
#include "utility.h"
#include "parameters.h"
#include "OCFD_NS_Solver.h"
#include "OCFD_mpi.h"
#include "OCFD_init.h"
#include "cuda_commen.h"
#include "OCFD_mpi_dev.h"
#include "OCFD_filtering.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Program entry point.
 *
 * Runs a fixed sequence of project routines and returns 0.  All callees are
 * defined in other files; the notes below are inferred from their names only.
 * NOTE(review): the calls presumably depend on one another in this order --
 * confirm before reordering anything.
 */
int main(int argc, char *argv[]){
    /* start-up; argc/argv are handed over by address */
    mpi_init(&argc , &argv);
    read_parameters();
    opencfd_mem_init_mpi();
    part();
    set_para_filtering();
    opencfd_mem_init_all();
    cuda_commen_init();
    init();
    /* NOTE(review): presumably the actual time integration -- confirm */
    NS_solver_real();
    /* tear-down */
    opencfd_mem_finalize_all();
    mpi_finalize();
    return 0;
}
#ifdef __cplusplus
}
#endif
##################################################################################################
OpenCFD-SCU-V1.00 CopyRight by Li-Xinliang, LHD, Institute of Mechanics, CAS (lixl@imech.ac.cn)
Coded by Liu-Shiwei, ICMSEC, Academy of Mathematics and Systems Science, CAS (liusw@lsec.cc.ac.cn)
Coded by Dang-Guanlin, LHD, Institute of Mechanics, CAS (dangguanlin@imech.ac.cn) 2020-01
Mesh(Nx,Ny,Nz): (250,240,90)
3D Partation: 1*1*1 Total procs=1
Re=1.000000 , Ma=8.000000 , Gamma=1.000000 , dt=1950.000000
Start Computing ......
##################################################################################################
OpenCFD-SCU-V1.00 CopyRight by Li-Xinliang, LHD, Institute of Mechanics, CAS (lixl@imech.ac.cn)
Coded by Liu-Shiwei, ICMSEC, Academy of Mathematics and Systems Science, CAS (liusw@lsec.cc.ac.cn)
Coded by Dang-Guanlin, LHD, Institute of Mechanics, CAS (dangguanlin@imech.ac.cn) 2020-01
Mesh(Nx,Ny,Nz): (25,240,20)
3D Partation: 1*1*1 Total procs=1
Re=5581.400000 , Ma=2.900000 , Gamma=1.400000 , dt=0.010000
Start Computing ......
//----------------------------------------------------------------------------------------------------------------------------------------
// OpenCFD-SC , 3-D compressible Navier-Stokes Finite difference Solver
// Copyright by LI Xinliang, LHD, Institute of Mechanics, CAS, Email: lixl@imech.ac.cn
//
// The default code is double precision computation
// If you want to use SINGLE PRECISION computation, you can change "OCFD_REAL_KIND=8" to "OCFD_REAL_KIND=4" ,
// and "OCFD_DATA_TYPE=MPI_DOUBLE_PRECISION" to "OCFD_DATA_TYPE=MPI_REAL" in the file OpenCFD.h
//----------------------------------------------------------------------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include "mpi.h"
#include "utility.h"
#include "parameters.h"
#include "OCFD_NS_Solver.h"
#include "OCFD_mpi.h"
#include "OCFD_init.h"
#include "cuda_commen.h"
#include "OCFD_mpi_dev.h"
#include "OCFD_filtering.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Program entry point.
 *
 * Runs a fixed sequence of project routines and returns 0.  All callees are
 * defined in other files; the notes below are inferred from their names only.
 * NOTE(review): the calls presumably depend on one another in this order --
 * confirm before reordering anything.
 */
int main(int argc, char *argv[]){
    /* start-up; argc/argv are handed over by address */
    mpi_init(&argc , &argv);
    read_parameters();
    opencfd_mem_init_mpi();
    part();
    set_para_filtering();
    opencfd_mem_init_all();
    cuda_commen_init();
    init();
    /* NOTE(review): presumably the actual time integration -- confirm */
    NS_solver_real();
    /* tear-down */
    opencfd_mem_finalize_all();
    mpi_finalize();
    return 0;
}
#ifdef __cplusplus
}
#endif
/*--------- This code runs only at initialization time -----------------------
Read the computational grid (Axx, Ayy, Azz) and compute the Jacobian coefficients;
this routine runs only during the initialization stage
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include "parameters.h"
#include "utility.h"
#include "OCFD_Comput_Jacobian3d.h"
#include "OCFD_Schemes_Choose.h"
#include "OCFD_mpi.h"
#include "OCFD_IO.h"
#include "parameters_d.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#include "OCFD_mpi_dev.h"
#include "commen_kernel.h"
#include "math.h"
#include "OCFD_ana.h"
#ifdef __cplusplus
extern "C"{
#endif
/* Number of metric arrays handled below, and how many of them (the leading
 * ones in the tables) are the mesh coordinates Axx, Ayy, Azz. */
enum { JAC3D_N_FIELDS = 13, JAC3D_N_COORDS = 3 };

/* Abort all ranks with a diagnostic when an MPI_File_open call did not succeed. */
static void jac3d_require_open(int mpi_err, const char *path)
{
    if(mpi_err != MPI_SUCCESS){
        fprintf(stderr, "Init_Jacobian3d: failed to open %s (MPI error code %d)\n", path, mpi_err);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}

/* Read `count` consecutive 3-D records from `fh` into host[0] .. host[count-1]. */
static void jac3d_read_records(MPI_File fh, REAL * const * host, int count)
{
    /* Distance in bytes between the starts of two consecutive records:
     * NZ_GLOBAL planes of NX_GLOBAL*NY_GLOBAL REALs plus 2*sizeof(int) per plane.
     * NOTE(review): the two ints per plane look like Fortran record markers -- confirm. */
    const MPI_Offset record_stride = (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;
    MPI_Offset offset = 0;
    for(int n = 0; n < count; n++){
        read_3d1(fh, offset, host[n]);
        offset += record_stride;
    }
}

/* Run exchange_boundary_xyz on every host array, then copy each host array
 * (including its halo) to the matching device field. */
static void jac3d_exchange_and_upload(REAL * const * host, cudaField * const * dev, int count)
{
    for(int n = 0; n < count; n++){
        exchange_boundary_xyz(host[n]);
    }
    for(int n = 0; n < count; n++){
        memcpy_All(host[n] , dev[n]->ptr , dev[n]->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
    }
}

/*
 * Init_Jacobian3d -- set up the mesh coordinates and the metric arrays.
 *
 *  1. Every host and device metric array is filled with 1.0.
 *  2. Depending on Init_stat and on the presence of OCFD3d-Jacobi.dat:
 *       Init_stat == 0          : an analytic mesh is written into the host
 *                                 arrays pAxx / pAyy / pAzz;
 *       OCFD3d-Jacobi.dat absent: Axx, Ayy, Azz are read from OCFD3d-Mesh.dat,
 *                                 uploaded, and Comput_Jacobian3d() is called;
 *       OCFD3d-Jacobi.dat exists: all 13 arrays are read from it and uploaded.
 *  3. ana_Jac() is called.
 *
 * A file that cannot be opened now stops the run with a message instead of
 * being read through an invalid handle.
 */
void Init_Jacobian3d()
{
    /* host_fld[n] and dev_fld[n] are the host / device copies of the same array;
     * the order is the record order used when reading OCFD3d-Jacobi.dat. */
    REAL * const host_fld[JAC3D_N_FIELDS] = {
        pAxx, pAyy, pAzz, pAkx, pAky, pAkz, pAix, pAiy, pAiz, pAsx, pAsy, pAsz, pAjac };
    cudaField * const dev_fld[JAC3D_N_FIELDS] = {
        pAxx_d, pAyy_d, pAzz_d, pAkx_d, pAky_d, pAkz_d, pAix_d, pAiy_d, pAiz_d, pAsx_d, pAsy_d, pAsz_d, pAjac_d };

    // init with unit
    for(int n = 0; n < JAC3D_N_FIELDS; n++){
        cuda_mem_value_init_warp(1.0 , dev_fld[n]->ptr , dev_fld[n]->pitch , nx_2lap , ny_2lap , nz_2lap);
    }
    {
        const int tmp_size = (nx+2*LAP)*(ny+2*LAP)*(nz+2*LAP);
        for(int n = 0; n < JAC3D_N_FIELDS; n++){
            REAL * tmp = host_fld[n];
            for(int i = 0; i < tmp_size; i++) tmp[i] = 1.0;
        }
    }
    // -------------------------------------------------------------------------
    const char * jacobi_file = "OCFD3d-Jacobi.dat";
    MPI_File tmp_file;

    if(Init_stat == 0){
        /* Analytic mesh: x = i*hx, (y,z) = r*(cos(theta), sin(theta)), with theta
         * starting at -PI/2 in steps of PI/NY_GLOBAL and r starting at r0 in steps
         * of dr/NZ_GLOBAL.
         * NOTE(review): only the host arrays are written in this branch; nothing is
         * uploaded and Comput_Jacobian3d() is not called, so the device arrays keep
         * the value 1.0 -- confirm this is intended. */
        const REAL r0 = 1.0;
        const REAL dr = 1.0;
        const REAL d_theta = PI / NY_GLOBAL;
        const REAL theta_0 = -PI*0.5;
        const REAL d_r = dr / NZ_GLOBAL;
        const int i_off = i_offset[npx];
        const int j_off = j_offset[npy];
        const int k_off = k_offset[npz];

        for(int k = 0; k < nz; k++){
            const int klap = k+LAP;
            const REAL r = r0 + d_r * (k + k_off);
            for(int j = 0; j < ny; j++){
                const int jlap = j+LAP;
                const REAL theta = theta_0 + d_theta * (j + j_off);
                for(int i = 0; i < nx; i++){
                    const int ilap = i+LAP;
                    const int i_real = i + i_off;
                    const int cell = ilap + nx_2lap*jlap + nx_2lap*ny_2lap*klap;
                    pAxx[cell] = i_real * hx;
                    pAyy[cell] = r * cos(theta);
                    pAzz[cell] = r * sin(theta);
                }
            }
        }
        /* On the last process in y the final j-plane is overwritten with y = 0, z = r. */
        if(npy == NPY0 - 1){
            const int jlap = ny - 1 + LAP;
            for(int k = 0; k < nz; k++){
                const int klap = k+LAP;
                const REAL r = r0 + d_r * (k + k_off);
                for(int i = 0; i < nx; i++){
                    const int cell = (i+LAP) + nx_2lap*jlap + nx_2lap*ny_2lap*klap;
                    pAyy[cell] = 0.0;
                    pAzz[cell] = r;
                }
            }
        }
    }else if(access(jacobi_file, F_OK) == -1){
        // OCFD3d-Jacobi.dat does not exist: read the mesh and compute the metrics
        if(my_id == 0) printf("read 3D mesh data: OCFD3d-Mesh.dat ...\n");
        jac3d_require_open(
            MPI_File_open(MPI_COMM_WORLD, "OCFD3d-Mesh.dat", MPI_MODE_RDONLY, MPI_INFO_NULL, &tmp_file),
            "OCFD3d-Mesh.dat");
        jac3d_read_records(tmp_file, host_fld, JAC3D_N_COORDS);
        MPI_File_close(&tmp_file);
        jac3d_exchange_and_upload(host_fld, dev_fld, JAC3D_N_COORDS);
        Comput_Jacobian3d();
    }else{
        // OCFD3d-Jacobi.dat exists: read the coordinates and all metrics from it
        if(my_id == 0) printf("OCFD3d-Jacobi.dat is exit\nread 3D Jacobi data ...... ");
        jac3d_require_open(
            MPI_File_open(MPI_COMM_WORLD, "OCFD3d-Jacobi.dat", MPI_MODE_RDONLY, MPI_INFO_NULL, &tmp_file),
            "OCFD3d-Jacobi.dat");
        jac3d_read_records(tmp_file, host_fld, JAC3D_N_FIELDS);
        MPI_File_close(&tmp_file);
        jac3d_exchange_and_upload(host_fld, dev_fld, JAC3D_N_FIELDS);
    }
    ana_Jac();
}
/*
 * Comput_Jacobian3d -- driver for the metric computation.
 *
 * Sequence: boundary treatment of the coordinates (boundary_Jac3d_Axx),
 * metric evaluation (comput_Jac3d), exchange_boundary_xyz_packed_dev on the
 * ten metric arrays, and finally boundary_Jac3d_Liftbody_Ajac.
 * Rank 0 prints a message before and after the metric evaluation.
 */
void Comput_Jacobian3d(){
    /* host / device pairs of the metric arrays, in exchange order */
    REAL * const metric_h[10] = { pAkx, pAky, pAkz, pAix, pAiy, pAiz, pAsx, pAsy, pAsz, pAjac };
    cudaField * const metric_d[10] = { pAkx_d, pAky_d, pAkz_d, pAix_d, pAiy_d, pAiz_d, pAsx_d, pAsy_d, pAsz_d, pAjac_d };

    boundary_Jac3d_Axx(); // boundary condition applied to Axx, Ayy, Azz only

    if (my_id == 0){
        printf("Comput Jacobian 3D data ...\n");
    }
    comput_Jac3d();
    if (my_id == 0){
        printf("Comput Jacobian 3D data OK\n");
    }

    // ---------------
    for(int m = 0; m < 10; m++){
        exchange_boundary_xyz_packed_dev(metric_h[m] , metric_d[m]);
    }

    boundary_Jac3d_Liftbody_Ajac(); // boundary condition for the metric arrays (Akx ... Ajac)
}
// ----------------------------------------------------------------------------
/*
 * comput_Jac3d_kernal -- evaluate the metric terms from the nine coordinate
 * derivatives and copy the face values into the adjacent ghost layers.
 *
 * Launch layout: 3-D grid of 3-D blocks; a thread handles the point
 * (x,y,z) = job.start + global thread index and does nothing when that
 * point is not below job.end in every direction.
 *
 * Inputs  xi..zk : derivative fields, read with get_Field at (x,y,z).
 * Outputs Akx..Asz, Ajac : written with get_Field_LAP at (x+LAP,y+LAP,z+LAP).
 * A thread on the first (index 0) or last (job.end-1) plane of a direction
 * additionally copies its ten results into the LAP ghost cells beyond that
 * plane along the same direction.
 */
__global__ void comput_Jac3d_kernal(
    cudaField xi,
    cudaField xj,
    cudaField xk,
    cudaField yi,
    cudaField yj,
    cudaField yk,
    cudaField zi,
    cudaField zj,
    cudaField zk,
    cudaField Akx,
    cudaField Aky,
    cudaField Akz,
    cudaField Aix,
    cudaField Aiy,
    cudaField Aiz,
    cudaField Asx,
    cudaField Asy,
    cudaField Asz,
    cudaField Ajac,
    cudaJobPackage job
){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // threads outside the job range have nothing to do
    if(!(x < job.end.x && y < job.end.y && z < job.end.z)) return;

    const REAL xi1 = get_Field(xi, x, y, z);
    const REAL xj1 = get_Field(xj, x, y, z);
    const REAL xk1 = get_Field(xk, x, y, z);
    const REAL yi1 = get_Field(yi, x, y, z);
    const REAL yj1 = get_Field(yj, x, y, z);
    const REAL yk1 = get_Field(yk, x, y, z);
    const REAL zi1 = get_Field(zi, x, y, z);
    const REAL zj1 = get_Field(zj, x, y, z);
    const REAL zk1 = get_Field(zk, x, y, z);

    // 1./Jacobian = d(x,y,z)/d(i,j,k)
    const REAL Jac1 = 1.0 / (xi1 * yj1 * zk1 + yi1 * zj1 * xk1 + zi1 * xj1 * yk1 - zi1 * yj1 * xk1 - yi1 * xj1 * zk1 - xi1 * zj1 * yk1);

    // interior values
    get_Field_LAP(Ajac, x+LAP, y+LAP, z+LAP) = Jac1;
    get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP) = (yj1 * zk1 - zj1 * yk1);
    get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP) = (zj1 * xk1 - xj1 * zk1);
    get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP) = (xj1 * yk1 - yj1 * xk1);
    get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP) = (yk1 * zi1 - zk1 * yi1);
    get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP) = (zk1 * xi1 - xk1 * zi1);
    get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP) = (xk1 * yi1 - yk1 * xi1);
    get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP) = (yi1 * zj1 - zi1 * yj1);
    get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP) = (zi1 * xj1 - xi1 * zj1);
    get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP) = (xi1 * yj1 - yi1 * xj1);

    // the ten output fields, so that the ghost fill can loop over them
    cudaField metric[10] = { Ajac, Akx, Aky, Akz, Aix, Aiy, Aiz, Asx, Asy, Asz };

    // lower x face -> ghost cells 0 .. LAP-1
    if(x == 0){
        for(int g = 0; g < LAP; g++){
            for(int m = 0; m < 10; m++){
                get_Field_LAP(metric[m], g, y+LAP, z+LAP) = get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP);
            }
        }
    }
    // upper x face -> the LAP ghost cells above it
    if(x == job.end.x-1){
        for(int g = 1; g <= LAP; g++){
            for(int m = 0; m < 10; m++){
                get_Field_LAP(metric[m], x+LAP+g, y+LAP, z+LAP) = get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP);
            }
        }
    }
    // lower y face
    if(y == 0){
        for(int g = 0; g < LAP; g++){
            for(int m = 0; m < 10; m++){
                get_Field_LAP(metric[m], x+LAP, g, z+LAP) = get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP);
            }
        }
    }
    // upper y face
    if(y == job.end.y-1){
        for(int g = 1; g <= LAP; g++){
            for(int m = 0; m < 10; m++){
                get_Field_LAP(metric[m], x+LAP, y+LAP+g, z+LAP) = get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP);
            }
        }
    }
    // lower z face
    if(z == 0){
        for(int g = 0; g < LAP; g++){
            for(int m = 0; m < 10; m++){
                get_Field_LAP(metric[m], x+LAP, y+LAP, g) = get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP);
            }
        }
    }
    // upper z face
    if(z == job.end.z-1){
        for(int g = 1; g <= LAP; g++){
            for(int m = 0; m < 10; m++){
                get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP+g) = get_Field_LAP(metric[m], x+LAP, y+LAP, z+LAP);
            }
        }
    }
}
/*
 * comput_Jac3d -- host driver of the metric evaluation.
 *
 * 1. Nine cudaField views are bound to the pu?_d / pv?_d / pw?_d buffers
 *    (suffix k, i, s for the first, second and third direction).
 * 2. OCFD_dx0_jac / OCFD_dy0_jac / OCFD_dz0_jac are called for each of
 *    Axx, Ayy, Azz on the range [LAP, n?_lap) using Stream[0].
 * 3. comput_Jac3d_kernal is launched over [0,nx) x [0,ny) x [0,nz).
 */
void comput_Jac3d()
{
/* bind a cudaField view to the pointer and pitch of an existing buffer */
#define JAC3D_BIND(view, buf) do{ (view).ptr = (buf)->ptr; (view).pitch = (buf)->pitch; }while(0)
    cudaField xi, xj, xk;
    cudaField yi, yj, yk;
    cudaField zi, zj, zk;
    JAC3D_BIND(xi, puk_d);  JAC3D_BIND(xj, pui_d);  JAC3D_BIND(xk, pus_d);
    JAC3D_BIND(yi, pvk_d);  JAC3D_BIND(yj, pvi_d);  JAC3D_BIND(yk, pvs_d);
    JAC3D_BIND(zi, pwk_d);  JAC3D_BIND(zj, pwi_d);  JAC3D_BIND(zk, pws_d);
#undef JAC3D_BIND

    /* coordinate fields and, per direction, the view receiving each derivative */
    cudaField * const coord[3] = { pAxx_d, pAyy_d, pAzz_d };
    cudaField * const d_dir1[3] = { &xi, &yi, &zi };
    cudaField * const d_dir2[3] = { &xj, &yj, &zj };
    cudaField * const d_dir3[3] = { &xk, &yk, &zk };

    cudaJobPackage job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap, nz_lap) );

    for(int c = 0; c < 3; c++){
        OCFD_dx0_jac(*coord[c], *d_dir1[c], job, BlockDim_X, &Stream[0], Jacbound[0]);
    }
    for(int c = 0; c < 3; c++){
        OCFD_dy0_jac(*coord[c], *d_dir2[c], job, BlockDim_Y, &Stream[0], Jacbound[1]);
    }
    for(int c = 0; c < 3; c++){
        OCFD_dz0_jac(*coord[c], *d_dir3[c], job, BlockDim_Z, &Stream[0], Jacbound[2]);
    }

    /* the metric kernel runs on interior indices without the halo offset */
    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim , &blockdim , BlockDimX/8 , BlockDimY , BlockDimZ , nx,ny,nz);
    job.setup( dim3(0,0,0) , dim3(nx,ny,nz) );
    CUDA_LAUNCH(( comput_Jac3d_kernal<<<griddim , blockdim>>>(
        xi, xj, xk, yi, yj, yk, zi, zj, zk,
        *pAkx_d, *pAky_d, *pAkz_d,
        *pAix_d, *pAiy_d, *pAiz_d,
        *pAsx_d, *pAsy_d, *pAsz_d,
        *pAjac_d, job) ));
}
// ------------------------------------------------------------------------
// Symmetry boundary at j=1 & j=ny_global
/*
 * boundary_Jac3d_kernal_y_r -- mirror fill at the upper y end.
 * For every point (x,y,z) in [job.start, job.end):
 *     pA(x,y,z) = value * pA(x, 2*(ny_lap_d-1) - y, z)
 * i.e. the source is the reflection of y about the plane ny_lap_d-1.
 * Launch layout: 3-D grid of 3-D blocks; out-of-range threads return at once.
 */
__global__ void boundary_Jac3d_kernal_y_r(cudaField pA, REAL value, cudaJobPackage job){
    unsigned int x = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;

    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ){
        return;
    }
    get_Field_LAP(pA, x, y, z) = value*get_Field_LAP(pA, x, 2*(ny_lap_d-1) - y, z);
}
/*
 * boundary_Jac3d_kernal_y_l -- mirror fill at the lower y end.
 * For every point (x,y,z) in [job.start, job.end):
 *     pA(x,y,z) = value * pA(x, 2*LAP - y, z)
 * i.e. the source is the reflection of y about the plane y = LAP.
 * Launch layout: 3-D grid of 3-D blocks; out-of-range threads return at once.
 */
__global__ void boundary_Jac3d_kernal_y_l(cudaField pA, REAL value, cudaJobPackage job){
    unsigned int x = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;

    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ){
        return;
    }
    get_Field_LAP(pA, x, y, z) = value*get_Field_LAP(pA, x, 2*LAP - y, z);
}
/*
 * boundary_Jac3d_kernal_y_ramp_wall_kernel -- fill the points of the job
 * range from their mirror images about the plane y = LAP.
 *
 * Every target (x,y,z) takes its value from (x, 2*LAP-y, z), multiplied by a
 * fixed sign and by scale factors:
 *   - where xx(x,LAP,z) <= 0 both scale factors are 1 (plain mirror);
 *   - otherwise they are
 *       sx = |-cos(2*seta) + sin(2*seta)*dy/dx|
 *       sy = | cos(2*seta) + dx/dy*sin(2*seta)|
 *     with dx, dy the differences of xx, yy between the mirror point and the
 *     point one index below it in y.
 * Launch layout: 3-D grid of 3-D blocks; out-of-range threads return at once.
 */
__global__ void boundary_Jac3d_kernal_y_ramp_wall_kernel(
    cudaField xx,
    cudaField yy,
    cudaField Akx,
    cudaField Aky,
    cudaField Akz,
    cudaField Aix,
    cudaField Aiy,
    cudaField Aiz,
    cudaField Asx,
    cudaField Asy,
    cudaField Asz,
    cudaField Ajac,
    REAL seta,
    cudaJobPackage job){
    unsigned int x = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;

    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;

    // unit factors reproduce the plain mirror exactly
    REAL sx = 1.0;
    REAL sy = 1.0;
    if( !(get_Field_LAP(xx, x, LAP, z) <= 0.0) ){
        REAL dx = get_Field_LAP(xx, x, 2*LAP-y, z) - get_Field_LAP(xx, x, 2*LAP-y-1, z);
        REAL dy = get_Field_LAP(yy, x, 2*LAP-y, z) - get_Field_LAP(yy, x, 2*LAP-y-1, z);
        sx = fabs(-cos(2*seta) + sin(2*seta)*dy/dx);
        sy = fabs(cos(2*seta) + dx/dy*sin(2*seta));
    }
    const REAL sxy = sx*sy;

    get_Field_LAP(Ajac, x, y, z) = sxy*get_Field_LAP(Ajac, x, 2*LAP-y, z);

    get_Field_LAP(Akx, x, y, z) = sy*get_Field_LAP(Akx, x, 2*LAP-y, z);
    get_Field_LAP(Aky, x, y, z) = -sx*get_Field_LAP(Aky, x, 2*LAP-y, z);
    get_Field_LAP(Akz, x, y, z) = sxy*get_Field_LAP(Akz, x, 2*LAP-y, z);

    get_Field_LAP(Aix, x, y, z) = -sy*get_Field_LAP(Aix, x, 2*LAP-y, z);
    get_Field_LAP(Aiy, x, y, z) = sx*get_Field_LAP(Aiy, x, 2*LAP-y, z);
    get_Field_LAP(Aiz, x, y, z) = -sxy*get_Field_LAP(Aiz, x, 2*LAP-y, z);

    get_Field_LAP(Asx, x, y, z) = sy*get_Field_LAP(Asx, x, 2*LAP-y, z);
    get_Field_LAP(Asy, x, y, z) = -sx*get_Field_LAP(Asy, x, 2*LAP-y, z);
    get_Field_LAP(Asz, x, y, z) = sxy*get_Field_LAP(Asz, x, 2*LAP-y, z);
}
/*
 * boundary_Jac3d_kernal_y_ramp_wall -- host wrapper that applies
 * boundary_Jac3d_kernal_y_ramp_wall_kernel to the lower y ghost layers
 * (x in [LAP,nx_lap), y in [0,LAP), z in [LAP,nz_lap)).  Only processes
 * with npy == 0 do any work.
 *
 * seta : angle passed by the caller; it is divided by PI before being handed
 *        to the kernel.
 *        NOTE(review): dividing by PI is an unusual unit conversion -- confirm
 *        what unit callers pass in.
 *
 * Fix: the launch configuration is now computed with cal_grid_block_dim for
 * the nx x LAP x nz job range.  Previously griddim/blockdim were left at
 * their default value (1,1,1), so a single thread ran and only the first
 * cell of the range was updated.
 */
void boundary_Jac3d_kernal_y_ramp_wall(REAL seta){
    if (npy == 0)
    {
        seta = seta/PI;
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_ramp_wall_kernel<<<griddim , blockdim>>>(*pAxx_d,*pAyy_d,*pAkx_d,
            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,seta,job) ));
    }
}
/*
 * boundary_Jac3d_kernal_z_cone_wall_kernel -- fill the points of the job
 * range from their mirror images about the plane z = LAP.
 *
 * Every target (x,y,z) takes its value from (x, y, 2*LAP-z), multiplied by a
 * fixed sign and by scale factors
 *     sx = |-cos(2*a) + sin(2*a)*dz/dx|
 *     sz = | cos(2*a) + dx/dz*sin(2*a)|
 * where dx, dz are the differences of xx, zz between the mirror point and
 * the point one index below it in z, and the angle a is
 *     seta1           where xx(x,y,LAP) <= 0,
 *     seta1 + seta2   elsewhere.
 * Launch layout: 3-D grid of 3-D blocks; out-of-range threads return at once.
 */
__global__ void boundary_Jac3d_kernal_z_cone_wall_kernel(
    cudaField xx,
    cudaField zz,
    cudaField Akx,
    cudaField Aky,
    cudaField Akz,
    cudaField Aix,
    cudaField Aiy,
    cudaField Aiz,
    cudaField Asx,
    cudaField Asy,
    cudaField Asz,
    cudaField Ajac,
    REAL seta1,
    REAL seta2,
    cudaJobPackage job){
    unsigned int x = blockDim.x*blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y*blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z*blockIdx.z + threadIdx.z + job.start.z;

    if( !(x < job.end.x && y < job.end.y && z < job.end.z) ) return;

    // the two regions use the same formulas and differ only in the angle
    const REAL a = ( get_Field_LAP(xx, x, y, LAP) <= 0.0 ) ? seta1 : (seta1+seta2);

    REAL dx = get_Field_LAP(xx, x, y, 2*LAP-z) - get_Field_LAP(xx, x, y, 2*LAP-z-1);
    REAL dz = get_Field_LAP(zz, x, y, 2*LAP-z) - get_Field_LAP(zz, x, y, 2*LAP-z-1);
    REAL sx = fabs(-cos(2*a) + sin(2*a)*dz/dx);
    REAL sz = fabs(cos(2*a) + dx/dz*sin(2*a));
    const REAL sxz = sx*sz;

    get_Field_LAP(Ajac, x, y, z) = sxz*get_Field_LAP(Ajac, x, y, 2*LAP-z);

    get_Field_LAP(Akx, x, y, z) = sz*get_Field_LAP(Akx, x, y, 2*LAP-z);
    get_Field_LAP(Aky, x, y, z) = -sxz*get_Field_LAP(Aky, x, y, 2*LAP-z);
    get_Field_LAP(Akz, x, y, z) = -sx*get_Field_LAP(Akz, x, y, 2*LAP-z);

    get_Field_LAP(Aix, x, y, z) = -sz*get_Field_LAP(Aix, x, y, 2*LAP-z);
    get_Field_LAP(Aiy, x, y, z) = sxz*get_Field_LAP(Aiy, x, y, 2*LAP-z);
    get_Field_LAP(Aiz, x, y, z) = sx*get_Field_LAP(Aiz, x, y, 2*LAP-z);

    get_Field_LAP(Asx, x, y, z) = -sz*get_Field_LAP(Asx, x, y, 2*LAP-z);
    get_Field_LAP(Asy, x, y, z) = sxz*get_Field_LAP(Asy, x, y, 2*LAP-z);
    get_Field_LAP(Asz, x, y, z) = sx*get_Field_LAP(Asz, x, y, 2*LAP-z);
}
/*
 * Host-side launcher for boundary_Jac3d_kernal_z_cone_wall_kernel.
 *
 * Only ranks with npz == 0 launch anything. The job box passed to the kernel
 * spans x in [LAP, nx_lap), y in [LAP, ny_lap) and z in [0, LAP).
 *
 * seta1, seta2 : angles forwarded to the kernel after being divided by PI.
 *                NOTE(review): the division by PI is kept exactly as found;
 *                confirm which unit callers pass in.
 *
 * Fix: the launch configuration was never computed, so the kernel ran with
 * default-constructed dim3 values. It is now sized the same way the other
 * launchers in this file size theirs.
 */
void boundary_Jac3d_kernal_z_cone_wall(REAL seta1, REAL seta2){
    if (npz == 0)
    {
        seta1 = seta1/PI;
        seta2 = seta2/PI;

        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(LAP, LAP, 0) , dim3(nx_lap, ny_lap, LAP) );

        /* Extents of the job box: nx * ny * LAP points.
         * Assumes nx_lap == nx + LAP and ny_lap == ny + LAP, as the extents
         * used by the y-boundary launchers in this file imply. */
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , LAP );

        CUDA_LAUNCH(( boundary_Jac3d_kernal_z_cone_wall_kernel<<<griddim , blockdim>>>(*pAxx_d,*pAzz_d,*pAkx_d,
            *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,seta1,seta2,job) ));
    }
}
/*
 * Fill the y-direction halo of the coordinate fields *pAxx_d, *pAyy_d and
 * *pAzz_d when the symmetry option is on (IF_SYMMETRY == 1).
 *
 * - Ranks with npy == 0 run boundary_Jac3d_kernal_y_l on the box
 *   y in [0, LAP).
 * - Ranks with npy == NPY0 - 1 run boundary_Jac3d_kernal_y_r on the box
 *   y in [ny_lap, ny_2lap).
 *
 * The second kernel argument is a per-field factor: 1.0 for Axx and Azz,
 * -1.0 for Ayy (presumably the mirror sign of the y coordinate -- confirm
 * against the kernel).
 */
void boundary_Jac3d_Axx()
{
    /* Nothing to do unless the symmetry boundary is enabled. */
    if (IF_SYMMETRY != 1)
        return;

    /* Lower y boundary. */
    if (npy == 0)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAxx_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAyy_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAzz_d , 1.0 , job) ));
    }

    /* Upper y boundary. */
    if (npy == NPY0 - 1)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAxx_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAyy_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAzz_d , 1.0 , job) ));
    }
}
/*
 * Fill the y-direction halo of the metric fields (Akx..Asz) and of Ajac
 * when the symmetry option is on (IF_SYMMETRY == 1).
 *
 * - Ranks with npy == 0 run boundary_Jac3d_kernal_y_l on y in [0, LAP).
 * - Ranks with npy == NPY0 - 1 run boundary_Jac3d_kernal_y_r on
 *   y in [ny_lap, ny_2lap).
 *
 * Per-field factors passed as the second kernel argument:
 *   Akx +1  Aky -1  Akz +1
 *   Aix -1  Aiy +1  Aiz -1
 *   Asx +1  Asy -1  Asz +1
 *   Ajac +1
 * NOTE(review): the factors are reproduced verbatim; their derivation is
 * not visible here.
 */
void boundary_Jac3d_Liftbody_Ajac()
{
    /* Nothing to do unless the symmetry boundary is enabled. */
    if (IF_SYMMETRY != 1)
        return;

    /* Lower y boundary. */
    if (npy == 0)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAkx_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAky_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAkz_d , 1.0 , job) ));

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAix_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAiy_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAiz_d ,-1.0 , job) ));

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAsx_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAsy_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAsz_d , 1.0 , job) ));

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_l<<<griddim , blockdim>>>(*pAjac_d , 1.0 , job) ));
    }

    /* Upper y boundary. */
    if (npy == NPY0 - 1)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz );

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAkx_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAky_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAkz_d , 1.0 , job) ));

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAix_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAiy_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAiz_d ,-1.0 , job) ));

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAsx_d , 1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAsy_d ,-1.0 , job) ));
        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAsz_d , 1.0 , job) ));

        CUDA_LAUNCH(( boundary_Jac3d_kernal_y_r<<<griddim , blockdim>>>(*pAjac_d , 1.0 , job) ));
    }
}
#ifdef __cplusplus
}
#endif
//Read & save file
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include "mpi.h"
#include "OCFD_ana.h"
#include "parameters.h"
#include "utility.h"
#include "OCFD_IO.h"
#include "OCFD_IO_mpi.h"
#include "io_warp.h"
#include "parameters_d.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#include "commen_kernel.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Load a flow field from disk into host arrays.
 *
 * Iflag_av == 0 : read an instantaneous data file into pd, pu, pv, pw, pT and
 *                 set the globals Istep and tt from its header. The file is
 *                 "OCFD<step>.dat" when rank 0 finds a non-negative step in
 *                 "Opencfd.msg", otherwise "opencfd.dat".
 * Iflag_av == 1 : read "opencfd.average" into the global averaged arrays
 *                 (pdm, pum, pvm, pwm, pTm), set Istep_average / tt_average
 *                 and copy the arrays to the device. If the file is absent
 *                 the averages are reset via init_time_average().
 *
 * File layout used here: [int marker][int step][REAL time][int marker]
 * followed by five fields, each NZ_GLOBAL planes of
 * [int marker][NX_GLOBAL*NY_GLOBAL REAL][int marker].
 *
 * Fixes relative to the previous version:
 *  - the time value is read straight into a REAL instead of being punned
 *    through a misaligned slot of an int array;
 *  - a failed MPI_File_open now aborts instead of silently continuing with
 *    uninitialised data;
 *  - a short read of "Opencfd.msg" falls back to "opencfd.dat".
 */
void read_file(
    int Iflag_av,
    REAL * pd,
    REAL * pu,
    REAL * pv,
    REAL * pw,
    REAL * pT)
{
    int Irestart_step;
    char filename1[100];

    /* Size of the file header and of one complete 3-D field, in bytes. */
    const MPI_Offset header_bytes = 3*sizeof(int)+sizeof(REAL);
    const MPI_Offset field_bytes  = (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;

    //-----------------------------------------------------------
    if(Iflag_av == 0){
        Irestart_step = -1;

        if (my_id == 0)
        {
            /* NOTE(review): the step is read as a raw binary int; confirm that
             * whatever produces Opencfd.msg writes it in the same form. */
            FILE *msg_file = fopen("Opencfd.msg", "r");
            if (msg_file != NULL)
            {
                if (fread(&Irestart_step, sizeof(int), 1, msg_file) != 1)
                {
                    /* Empty or truncated message file: use the default file. */
                    Irestart_step = -1;
                }
                fclose(msg_file);
            }
            else
            {
                printf("Opencfd.msg is not exist, read initial file : opencfd.dat ......\n");
            }
        }
        MPI_Bcast(&Irestart_step, 1, MPI_INT, 0, MPI_COMM_WORLD);

        if (Irestart_step < 0)
        {
            sprintf(filename1, "opencfd.dat");
        }
        else
        {
            sprintf(filename1, "OCFD%08d.dat", Irestart_step);
        }

        MPI_File data_file;
        int  step_in = 0;
        REAL time_in = 0;

        if(my_id == 0) printf("read initial data file: %s \n\n", filename1);

        if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_RDONLY, MPI_INFO_NULL, &data_file) != MPI_SUCCESS)
        {
            fprintf(stderr, "read_file: cannot open data file %s\n", filename1);
            MPI_Abort(MPI_COMM_WORLD, 1);
        }

        /* Header: skip the leading marker, then step and time. */
        MPI_File_read_at_all(data_file, sizeof(int), &step_in, 1, MPI_INT, &status);
        MPI_File_read_at_all(data_file, 2*sizeof(int), &time_in, 1, OCFD_DATA_TYPE, &status);

        Istep = step_in;
        tt = time_in;
        if(my_id == 0) printf("Istep=%d , tt=%lf\n", Istep, tt);

        MPI_Offset offset = header_bytes;
        read_3d1(data_file, offset, pd);
        offset += field_bytes;
        read_3d1(data_file, offset, pu);
        offset += field_bytes;
        read_3d1(data_file, offset, pv);
        offset += field_bytes;
        read_3d1(data_file, offset, pw);
        offset += field_bytes;
        read_3d1(data_file, offset, pT);

        MPI_File_close(&data_file);
        //------------------------
        if (my_id == 0)
            printf("read data ok\n");
    }
    //--------------------
    if(Iflag_av == 1)
    {
        // averaged file
        sprintf(filename1, "opencfd.average");

        if (access(filename1, F_OK) == -1){
            // The file does not exist: start averaging from scratch.
            if(my_id == 0) printf("Average file: %s is not exit\n\n", filename1);
            Istep_average = 0;
            tt_average = 0.0;
            init_time_average();
        }else{
            if (my_id == 0)
                printf("read average_data begin\n");

            MPI_File avg_file;
            int  step_in = 0;
            REAL time_in = 0;

            if (MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_RDONLY, MPI_INFO_NULL, &avg_file) != MPI_SUCCESS)
            {
                fprintf(stderr, "read_file: cannot open average file %s\n", filename1);
                MPI_Abort(MPI_COMM_WORLD, 1);
            }

            MPI_File_read_at_all(avg_file, sizeof(int), &step_in, 1, MPI_INT, &status);
            MPI_File_read_at_all(avg_file, 2*sizeof(int), &time_in, 1, OCFD_DATA_TYPE, &status);

            Istep_average = step_in;
            tt_average = time_in;
            if(my_id == 0) printf("Istep_average=%d , tt_average=%lf\n", Istep_average, tt_average);

            /* Must run before the averaged arrays are filled (order kept from
             * the original; presumably it prepares pdm..pTm -- confirm). */
            init_time_average();

            MPI_Offset offset = header_bytes;
            read_3d1(avg_file, offset, pdm);
            offset += field_bytes;
            read_3d1(avg_file, offset, pum);
            offset += field_bytes;
            read_3d1(avg_file, offset, pvm);
            offset += field_bytes;
            read_3d1(avg_file, offset, pwm);
            offset += field_bytes;
            read_3d1(avg_file, offset, pTm);

            MPI_File_close(&avg_file);
            //-- ----------------------
            if (my_id == 0)
                printf("read average_data ok\n");

            /* Push the averaged host arrays to their device counterparts. */
            memcpy_inner(pdm , pdm_d->ptr , pdm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
            memcpy_inner(pum , pum_d->ptr , pum_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
            memcpy_inner(pvm , pvm_d->ptr , pvm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
            memcpy_inner(pwm , pwm_d->ptr , pwm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
            memcpy_inner(pTm , pTm_d->ptr , pTm_d->pitch , H2D , nx_2lap , ny_2lap , nz_2lap);
        }
        average_IO = 0;
    }
    //---------------------
}
//----------------------------------------------------------------------------------
//================================================================================
/*
 * Write a flow field (pd, pu, pv, pw, pT) to disk with MPI-IO.
 *
 * Iflag_av == 0 : file "OCFD<Istep_name>.dat", header holds Istep and tt.
 * otherwise     : file "OCFD<Istep_name>.average", header holds
 *                 Istep_average and tt_average.
 *
 * Header layout: [int marker][int step][REAL time][int marker], followed by
 * the five fields written by write_3d1 at consecutive offsets.
 *
 * Fixes relative to the previous version:
 *  - the header marker is now the real payload size, sizeof(int)+sizeof(REAL).
 *    The old value sizeof(int[3]) only matched when REAL is 8 bytes wide;
 *  - the time is written from a REAL variable rather than through a
 *    misaligned pointer into an int array;
 *  - the two identical header-writing branches are merged.
 */
void OCFD_save(
    int Iflag_av,
    int Istep_name,
    REAL * pd,
    REAL * pu,
    REAL * pv,
    REAL * pw,
    REAL * pT)
{
    char filename1[120];
    //-------------------------------------------
    MPI_File tmp_file;

    /* Header payload, selected by the file kind. */
    int  head_step  = (Iflag_av == 0) ? Istep : Istep_average;
    REAL head_time  = (Iflag_av == 0) ? tt : tt_average;
    int  head_bytes = (int)(sizeof(int) + sizeof(REAL));

    if(Iflag_av == 0){
        sprintf(filename1, "OCFD%08d.dat", Istep_name);
    }else{
        sprintf(filename1, "OCFD%08d.average", Istep_name);
    }

    if(my_id == 0) printf("write data file: %s\n", filename1);

    MPI_File_open(MPI_COMM_WORLD, filename1, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &tmp_file);

    /* Collective header write: marker, step, time, marker. */
    MPI_File_write_at_all(tmp_file, 0, &head_bytes, 1, MPI_INT, &status);
    MPI_File_write_at_all(tmp_file, sizeof(int), &head_step, 1, MPI_INT, &status);
    MPI_File_write_at_all(tmp_file, 2*sizeof(int), &head_time, 1, OCFD_DATA_TYPE, &status);
    MPI_File_write_at_all(tmp_file, 2*sizeof(int)+sizeof(REAL), &head_bytes, 1, MPI_INT, &status);

    /* Bytes taken by one complete 3-D field including per-plane markers. */
    const MPI_Offset field_bytes = (2*sizeof(int) + NX_GLOBAL * NY_GLOBAL * sizeof(REAL)) * NZ_GLOBAL;

    MPI_Offset offset = 3*sizeof(int)+sizeof(REAL);
    write_3d1(tmp_file, offset, pd);
    offset += field_bytes;
    write_3d1(tmp_file, offset, pu);
    offset += field_bytes;
    write_3d1(tmp_file, offset, pv);
    offset += field_bytes;
    write_3d1(tmp_file, offset, pw);
    offset += field_bytes;
    write_3d1(tmp_file, offset, pT);

    MPI_File_close(&tmp_file);

    /* Disabled legacy code, kept for reference: */
    //if (my_id == 0)
    //{
    //    if (Iflag_av == 0)
    //    {
    //        printf("write data OK\n");
    //        tmp_file = fopen("Opencfd.msg", "a");
    //        fprintf(tmp_file, "%d", Istep_name);
    //        fclose(tmp_file);
    //    }
    //}
}
//-------------------------------------------------------------------------------------------
//---------------------------------------------------------------------------------------------
/*
 * Strip the LAP-wide halo from a padded local field and hand the packed
 * interior (nx * ny * nz values, x fastest) to write_3d.
 *
 * file   : open MPI file handle, forwarded to write_3d.
 * offset : byte offset of this field in the file, forwarded to write_3d.
 * pU     : padded local array of (nx+2*LAP) * (ny+2*LAP) * (nz+2*LAP) values.
 *
 * Fixes: the temporary buffer size is computed in size_t (the old int product
 * nx*ny*nz could overflow before being widened) and the allocation is
 * checked instead of being dereferenced blindly.
 */
void write_3d1(
    MPI_File file,
    MPI_Offset offset,
    REAL *pU)
{
    const size_t row_len   = (size_t)(nx + 2 * LAP);            /* padded x extent      */
    const size_t plane_len = row_len * (size_t)(ny + 2 * LAP);  /* padded x*y extent    */
    const size_t n_inner   = (size_t)(nx) * (size_t)(ny) * (size_t)(nz);

    REAL *packed = (REAL *)malloc(n_inner * sizeof(REAL));
    if (packed == NULL && n_inner > 0)
    {
        fprintf(stderr, "write_3d1: cannot allocate %lu bytes\n", (unsigned long)(n_inner * sizeof(REAL)));
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* Copy interior rows one after another into the packed buffer. */
    REAL *dst = packed;
    for (int k = LAP; k < nz + LAP; k++)
    {
        for (int j = LAP; j < ny + LAP; j++)
        {
            const REAL *src = pU + (size_t)k * plane_len + (size_t)j * row_len + LAP;
            for (int i = 0; i < nx; i++)
            {
                (*dst++) = src[i];
            }
        }
    }

    write_3d(file, offset, packed);

    free(packed);
}
/*
 * Read the local interior of one field with read_3d and scatter it into the
 * halo-padded local array.
 *
 * file   : open MPI file handle, forwarded to read_3d.
 * offset : byte offset of this field in the file, forwarded to read_3d.
 * pU     : padded local array of (nx+2*LAP) * (ny+2*LAP) * (nz+2*LAP) values;
 *          only the interior [LAP, n+LAP) in each direction is written.
 *
 * Fixes: the temporary buffer size is computed in size_t (the old int product
 * nx*ny*nz could overflow before being widened) and the allocation is
 * checked before use.
 */
void read_3d1(
    MPI_File file,
    MPI_Offset offset,
    REAL *pU)
{
    const size_t row_len   = (size_t)(nx + 2 * LAP);            /* padded x extent   */
    const size_t plane_len = row_len * (size_t)(ny + 2 * LAP);  /* padded x*y extent */
    const size_t n_inner   = (size_t)(nx) * (size_t)(ny) * (size_t)(nz);

    REAL *packed = (REAL *)malloc(n_inner * sizeof(REAL));
    if (packed == NULL && n_inner > 0)
    {
        fprintf(stderr, "read_3d1: cannot allocate %lu bytes\n", (unsigned long)(n_inner * sizeof(REAL)));
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    read_3d(file, offset, packed);

    /* Unpack row by row into the interior of the padded array. */
    const REAL *src = packed;
    for (int k = LAP; k < nz + LAP; k++)
    {
        for (int j = LAP; j < ny + LAP; j++)
        {
            REAL *dst = pU + (size_t)k * plane_len + (size_t)j * row_len + LAP;
            for (int i = 0; i < nx; i++)
            {
                dst[i] = (*src++);
            }
        }
    }

    free(packed);
}
#ifdef __cplusplus
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mpi.h"
#include "parameters.h"
#include "utility.h"
#include "OCFD_mpi.h"
#include "io_warp.h"
//---------------------------------------------------
#ifdef __cplusplus
extern "C"{
#endif
//void write_2d_XYa(
// FILE *file,
// int ka,
// int size_x,
// int size_y,
// int lap,
// int *pU)
//{
//
// int(*U)
// [size_y + 2*lap][size_x + 2*lap] = (int(*)[size_y + 2*lap][size_x + 2*lap])(pU);
// int(*U2d)
// [NX_GLOBAL], (*U0)[NX_GLOBAL];
// int node_k, k_local;
//
// U2d = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
// memset((void*)U2d, 0, NX_GLOBAL * NY_GLOBAL * sizeof(int));
// if (my_id == 0){
// U0 = (int(*)[NX_GLOBAL])malloc(sizeof(int) * NX_GLOBAL * NY_GLOBAL);
// }
// //--------------------------------
// get_k_node(ka, &node_k, &k_local);
// k_local += lap;
// int i, j;
// if(npz == node_k){
// for (j = lap; j < ny + lap; j++)
// {
// for (i = lap; i < nx + lap; i++)
// {
// U2d[j - lap + j_offset[npy]][i - lap + i_offset[npx]] = U[k_local][j][i];
// }
// }
// }
// MPI_Reduce(&U2d[0][0], &U0[0][0], NX_GLOBAL * NY_GLOBAL, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
//// if (my_id == 0)
//// FWRITE(U0, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
//
// if(my_id == 0){
// for(j = 0; j < NY_GLOBAL; j++){
// for(i = 0; i < NX_GLOBAL; i++){
// fprintf(file, "%08d\n", U0[j][i]);
// }
// }
// }
//
// free(U2d);
// if (my_id == 0)
// free(U0);
//}
/*
 * Gather one global x-y plane (global k index ka) of an int field and of a
 * REAL field on rank 0 and print it as text, one "%08d%15.6lf" line per
 * point, x fastest.
 *
 * file           : text stream; only used on rank 0.
 * ka             : plane index passed to get_k_node.
 * size_x, size_y : interior extents of the int array pU.
 * lap            : halo width of the int array pU.
 * pU             : int field, viewed as [..][size_y+2*lap][size_x+2*lap].
 * pU1            : REAL field, viewed as [..][ny+2*LAP][nx+2*LAP].
 *
 * Each rank fills a zeroed global-size plane with its own part (only ranks
 * with npz == node_k contribute) and the planes are summed onto rank 0 with
 * MPI_Reduce.
 *
 * Fixes: the root-only receive buffers are initialised to NULL so non-root
 * ranks no longer evaluate an indeterminate pointer, and all allocations are
 * checked.
 *
 * NOTE(review): both copies read column i+1, and the REAL copy indexes
 * k_local + LAP after k_local has already been advanced by lap. These are
 * reproduced unchanged because the intent is not visible here -- confirm.
 */
void write_2d_XY(
    FILE *file,
    int ka,
    int size_x,
    int size_y,
    int lap,
    int *pU,
    REAL *pU1)
{
    int(*U)[size_y + 2*lap][size_x + 2*lap] = (int(*)[size_y + 2*lap][size_x + 2*lap])(pU);
    REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);

    int(*U2d)[NX_GLOBAL] = NULL, (*U0)[NX_GLOBAL] = NULL;       /* local / root int planes  */
    REAL(*U2d1)[NX_GLOBAL] = NULL, (*U01)[NX_GLOBAL] = NULL;    /* local / root REAL planes */
    int node_k, k_local;

    const size_t plane_pts = (size_t)(NX_GLOBAL) * (size_t)(NY_GLOBAL);

    /* calloc gives the zero fill the sum-reduction relies on. */
    U2d  = (int(*)[NX_GLOBAL])calloc(plane_pts, sizeof(int));
    U2d1 = (REAL(*)[NX_GLOBAL])calloc(plane_pts, sizeof(REAL));

    if (my_id == 0){
        U0  = (int(*)[NX_GLOBAL])malloc(sizeof(int) * plane_pts);
        U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * plane_pts);
    }

    if (U2d == NULL || U2d1 == NULL || (my_id == 0 && (U0 == NULL || U01 == NULL)))
    {
        fprintf(stderr, "write_2d_XY: out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    //--------------------------------
    get_k_node(ka, &node_k, &k_local);
    k_local += lap;

    int i, j;
    if(npz == node_k){
        for (j = lap; j < ny + lap; j++)
        {
            for (i = lap; i < nx + lap; i++)
            {
                U2d[j - lap + j_offset[npy]][i - lap + i_offset[npx]] = U[k_local][j][i+1];
            }
        }

        for (j = LAP; j < ny + LAP; j++)
        {
            for (i = LAP; i < nx + LAP; i++)
            {
                U2d1[j - LAP + j_offset[npy]][i - LAP + i_offset[npx]] = U1[k_local + LAP][j][i+1];
            }
        }
    }

    /* The receive buffer is only significant on the root rank. */
    MPI_Reduce(U2d, U0, NX_GLOBAL * NY_GLOBAL, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(U2d1, U01, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);

    if(my_id == 0){
        for(j = 0; j < NY_GLOBAL; j++){
            for(i = 0; i < NX_GLOBAL; i++){
                fprintf(file, "%08d%15.6lf\n", U0[j][i], U01[j][i]);
            }
        }
    }

    free(U2d);
    free(U2d1);
    free(U0);    /* NULL on non-root ranks */
    free(U01);
}
/*
 * Gather one global x-y plane (global k index ka) of a REAL field on rank 0
 * and write it with FWRITE.
 *
 * file : output stream; only used on rank 0.
 * ka   : plane index passed to get_k_node.
 * pU1  : local field, viewed as [..][ny+2*LAP][nx+2*LAP].
 *
 * Each rank fills a zeroed NX_GLOBAL*NY_GLOBAL plane with its own part (only
 * ranks with npz == node_k contribute); MPI_Reduce with MPI_SUM assembles
 * the plane on rank 0.
 *
 * Fixes: the root-only buffer is initialised to NULL so non-root ranks no
 * longer evaluate an indeterminate pointer, and allocations are checked.
 */
void write_2d_XYa(
    FILE *file,
    int ka,
    REAL *pU1)
{
    REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
    REAL(*U2d1)[NX_GLOBAL] = NULL, (*U01)[NX_GLOBAL] = NULL;
    int node_k, k_local;

    const size_t plane_pts = (size_t)(NX_GLOBAL) * (size_t)(NY_GLOBAL);

    /* calloc gives the zero fill the sum-reduction relies on. */
    U2d1 = (REAL(*)[NX_GLOBAL])calloc(plane_pts, sizeof(REAL));
    if (my_id == 0){
        U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * plane_pts);
    }
    if (U2d1 == NULL || (my_id == 0 && U01 == NULL))
    {
        fprintf(stderr, "write_2d_XYa: out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    //--------------------------------
    get_k_node(ka, &node_k, &k_local);

    int i, j;
    if(npz == node_k){
        for (j = LAP; j < ny + LAP; j++)
        {
            for (i = LAP; i < nx + LAP; i++)
            {
                U2d1[j - LAP + j_offset[npy]][i - LAP + i_offset[npx]] = U1[k_local + LAP][j][i];
            }
        }
    }

    /* The receive buffer is only significant on the root rank. */
    MPI_Reduce(U2d1, U01, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);

    if(my_id == 0) FWRITE(U01, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)

    free(U2d1);
    free(U01);   /* NULL on non-root ranks */
}
/*
 * Gather one global y-z plane (global i index ia) of a REAL field on rank 0
 * and write it with FWRITE.
 *
 * file : output stream; only used on rank 0.
 * ia   : plane index passed to get_i_node.
 * pU1  : local field, viewed as [..][ny+2*LAP][nx+2*LAP].
 *
 * Each rank fills a zeroed NY_GLOBAL*NZ_GLOBAL plane (y fastest) with its
 * own part (only ranks with npx == node_i contribute); MPI_Reduce with
 * MPI_SUM assembles the plane on rank 0.
 *
 * Fixes: the root-only buffer is initialised to NULL so non-root ranks no
 * longer evaluate an indeterminate pointer, and allocations are checked.
 */
void write_2d_YZa(
    FILE *file,
    int ia,
    REAL *pU1)
{
    REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
    REAL(*U2d1)[NY_GLOBAL] = NULL, (*U01)[NY_GLOBAL] = NULL;
    int node_i, i_local;

    const size_t plane_pts = (size_t)(NY_GLOBAL) * (size_t)(NZ_GLOBAL);

    /* calloc gives the zero fill the sum-reduction relies on. */
    U2d1 = (REAL(*)[NY_GLOBAL])calloc(plane_pts, sizeof(REAL));
    if (my_id == 0){
        U01 = (REAL(*)[NY_GLOBAL])malloc(sizeof(REAL) * plane_pts);
    }
    if (U2d1 == NULL || (my_id == 0 && U01 == NULL))
    {
        fprintf(stderr, "write_2d_YZa: out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    //--------------------------------
    get_i_node(ia, &node_i, &i_local);

    int j, k;
    if(npx == node_i){
        for (k = LAP; k < nz + LAP; k++)
        {
            for (j = LAP; j < ny + LAP; j++)
            {
                U2d1[k - LAP + k_offset[npz]][j - LAP + j_offset[npy]] = U1[k][j][i_local + LAP];
            }
        }
    }

    /* The receive buffer is only significant on the root rank. */
    MPI_Reduce(U2d1, U01, NY_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);

    if(my_id == 0) FWRITE(U01, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)

    free(U2d1);
    free(U01);   /* NULL on non-root ranks */
}
/*
 * Gather one global x-z plane (global j index ja) of a REAL field on rank 0
 * and write it with FWRITE.
 *
 * file : output stream; only used on rank 0.
 * ja   : plane index passed to get_j_node.
 * pU1  : local field, viewed as [..][ny+2*LAP][nx+2*LAP].
 *
 * Each rank fills a zeroed NX_GLOBAL*NZ_GLOBAL plane (x fastest) with its
 * own part (only ranks with npy == node_j contribute); MPI_Reduce with
 * MPI_SUM assembles the plane on rank 0.
 *
 * Fixes: the root-only buffer is initialised to NULL so non-root ranks no
 * longer evaluate an indeterminate pointer, and allocations are checked.
 */
void write_2d_XZa(
    FILE *file,
    int ja,
    REAL *pU1)
{
    REAL(*U1)[ny + 2*LAP][nx + 2*LAP] = (REAL(*)[ny + 2*LAP][nx + 2*LAP])(pU1);
    REAL(*U2d1)[NX_GLOBAL] = NULL, (*U01)[NX_GLOBAL] = NULL;
    int node_j, j_local;

    const size_t plane_pts = (size_t)(NX_GLOBAL) * (size_t)(NZ_GLOBAL);

    /* calloc gives the zero fill the sum-reduction relies on. */
    U2d1 = (REAL(*)[NX_GLOBAL])calloc(plane_pts, sizeof(REAL));
    if (my_id == 0){
        U01 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * plane_pts);
    }
    if (U2d1 == NULL || (my_id == 0 && U01 == NULL))
    {
        fprintf(stderr, "write_2d_XZa: out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    //--------------------------------
    get_j_node(ja, &node_j, &j_local);

    int i, k;
    if(npy == node_j){
        for (k = LAP; k < nz + LAP; k++)
        {
            for (i = LAP; i < nx + LAP; i++)
            {
                U2d1[k - LAP + k_offset[npz]][i - LAP + i_offset[npx]] = U1[k][j_local + LAP][i];
            }
        }
    }

    /* The receive buffer is only significant on the root rank. */
    MPI_Reduce(U2d1, U01, NX_GLOBAL * NZ_GLOBAL, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);

    if(my_id == 0) FWRITE(U01, sizeof(REAL), NX_GLOBAL * NZ_GLOBAL, file)

    free(U2d1);
    free(U01);   /* NULL on non-root ranks */
}
//--------------------------------------------------------------
//-----Write a 2D Y-Z (j-k) plane from 3-D array
//void write_2d_YZa(
// FILE *file,
// int ia,
// REAL *pU)
//{
//
// REAL(*U)
// [ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
// REAL(*U2d), (*U0);
// int node_i, i_local;
//
// U2d = (REAL *)malloc(sizeof(REAL) * ny * nz);
// if (my_id == 0)
// U0 = (REAL *)malloc(sizeof(REAL) * NY_GLOBAL * NZ_GLOBAL);
// //--------------------------------
// get_i_node(ia, &node_i, &i_local);
// i_local += LAP;
// int k, j;
// REAL *tmp = U2d;
// for (k = LAP; k < nz + LAP; k++)
// {
// for (j = LAP; j < ny + LAP; j++)
// {
// (*tmp++) = U[k][j][i_local];
// }
// }
//
// for (int proc_k = 0; proc_k < NPZ0; k++)
// {
// for (int kk = k_offset[proc_k]; kk < k_offset[proc_k] + k_nn[proc_k]; kk++)
// {
// for (int proc_j = 0; proc_j < NPY0; proc_j++)
// {
// if (npx == node_i && npy == proc_j && npz == proc_k)
// {
// k = kk - k_offset[proc_k];
// MPI_Bsend(U2d + k * ny, ny, OCFD_DATA_TYPE, 0, kk, MPI_COMM_WORLD);
// }
// if (my_id == 0)
// {
// int recv_offset = j_offset[proc_j] + NY_GLOBAL * kk;
// MPI_Status status;
// MPI_Recv(U0 + recv_offset, j_nn[proc_j], OCFD_DATA_TYPE, PROCIdx2Num(node_i, proc_j, proc_k), kk, MPI_COMM_WORLD, &status);
// }
// }
// MPI_Barrier(MPI_COMM_WORLD);
// }
// }
// if (my_id == 0)
// FWRITE(U0, sizeof(REAL), NY_GLOBAL * NZ_GLOBAL, file)
//
// free(U2d);
// if (my_id == 0)
// free(U0);
//}
//-------------------------------------------------
//----Write a 2d xz-plane from 3d array------------------------
//void write_2d_XZa(
// FILE *file,
// int ja,
// REAL *pU)
//{
//
// REAL(*U)
// [ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);
// REAL(*U2d), (*U0);
// int node_j, j_local;
//
// U2d = (REAL *)malloc(sizeof(REAL) * nx * nz);
// if (my_id == 0)
// U0 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
// //--------------------------------
// get_j_node(ja, &node_j, &j_local);
// j_local += LAP;
// int k, i;
// REAL *tmp = U2d;
// for (k = LAP; k < nz + LAP; k++)
// {
// for (i = LAP; i < nx + LAP; i++)
// {
// (*tmp++) = U[k][j_local][i];
// }
// }
// for (int proc_k = 0; proc_k < NPZ0; k++)
// {
// for (int kk = k_offset[proc_k]; kk < k_offset[proc_k] + k_nn[proc_k]; kk++)
// {
// for (int proc_i = 0; proc_i < NPX0; proc_i++)
// {
// if (npy == node_j && npx == proc_i && npz == proc_k)
// {
// k = kk - k_offset[proc_k];
// MPI_Bsend(U2d + k * nx, nx, OCFD_DATA_TYPE, 0, kk, MPI_COMM_WORLD);
// }
// if (my_id == 0)
// {
// int recv_offset = i_offset[proc_i] + NX_GLOBAL * kk;
// MPI_Status status;
// MPI_Recv(U0 + recv_offset, i_nn[proc_i], OCFD_DATA_TYPE, PROCIdx2Num(proc_i, node_j, proc_k), kk, MPI_COMM_WORLD, &status);
// }
// }
// MPI_Barrier(MPI_COMM_WORLD);
// }
// }
// if (my_id == 0)
// FWRITE(U0, sizeof(REAL), NX_GLOBAL * NZ_GLOBAL, file)
//
// free(U2d);
// if (my_id == 0)
// free(U0);
//}
//--------------------------------------------------
//----Write points from 3d array------------------------
// NOTE: the index convention (0-based or 1-based) used for ia, ja, ka in the external input file still needs to be made explicit.
/*
 * Collect mpoints individual values of a 3-D field on rank 0 and write them
 * with FWRITE.
 *
 * file       : output stream; only used on rank 0.
 * pU         : local field, viewed as [..][ny+2*LAP][nx+2*LAP].
 * mpoints    : number of probe points.
 * ia, ja, ka : per-point indices, resolved to an owning process and a local
 *              index by get_i_node / get_j_node / get_k_node.
 *
 * For every point the owning rank sends its value to rank 0 with MPI_Bsend
 * and rank 0 receives it into slot m.
 * NOTE(review): MPI_Bsend needs an attached buffer; presumably that is done
 * elsewhere -- confirm.
 */
void write_points(
    FILE *file,
    REAL *pU,
    int mpoints,
    int *ia,
    int *ja,
    int *ka)
{
    int node_i, node_j, node_k;      /* process coordinates owning the point */
    int i_local, j_local, k_local;   /* point position inside that process   */

    REAL(*U)
    [ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);

    REAL *U1 = (REAL *)malloc(sizeof(REAL) * mpoints);
    //--------------------------------
    int m = 0;
    while (m < mpoints)
    {
        get_i_node(ia[m], &node_i, &i_local);
        get_j_node(ja[m], &node_j, &j_local);
        get_k_node(ka[m], &node_k, &k_local);

        /* Owner ships the value (shifted past the halo) to rank 0. */
        const int owns_point = (npx == node_i) && (npy == node_j) && (npz == node_k);
        if (owns_point)
        {
            MPI_Bsend(&U[k_local + LAP][j_local + LAP][i_local + LAP], 1, OCFD_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
        }

        if (my_id == 0)
        {
            MPI_Status status;
            MPI_Recv(&U1[m], 1, OCFD_DATA_TYPE, PROCIdx2Num(node_i, node_j, node_k), 0, MPI_COMM_WORLD, &status);
        }

        m++;
    }

    if (my_id == 0)
        FWRITE(U1, sizeof(REAL), mpoints, file)

    free(U1);
}
//--------------------------------------------------
//--------------------------------------------------
//void read_3d(
// FILE *file,
// REAL *pU)
//{
//
// REAL(*U)
// [ny][nx] = PTR2ARRAY2(pU, nx, ny);
//
// REAL(*buff2d)
// [NX_GLOBAL], (*buff1)[NX_GLOBAL], *buff2, *buff_recv;
// int sendcounts1[NPY0], displs1[NPY0], sendcounts2[NPX0], displs2[NPX0];
// //---------------------------------------------------------------
// if (npx == 0)
// {
// if (npy == 0)
// {
// buff2d = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
// }
// buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
// buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);//NY_GLOBAL > ny
// }
// buff_recv = (REAL *)malloc(sizeof(REAL) * nx * ny);
//
// if (my_id == 0)
// printf("read 3d data ...\n");
// // sendcounts1 displs1用于j方向分布
// for (int j = 0; j < NPY0; j++)
// {
// sendcounts1[j] = NX_GLOBAL * j_nn[j];
// displs1[j] = j_offset[j] * NX_GLOBAL;
// }
//
// for (int i = 0; i < NPX0; i++)
// {
// sendcounts2[i] = ny * i_nn[i];
// displs2[i] = i_offset[i] * ny;
// }
//
// int proc_k, k_local;
// for (int kk = 0; kk < NZ_GLOBAL; kk++)
// {
// get_k_node(kk, &proc_k, &k_local);
// if (my_id == 0)
// FREAD(buff2d, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
//
// if (proc_k != 0)
// {
// // k方向发送
// MPI_Status status;
// if (my_id == 0)
// MPI_Send(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, proc_k * (NPX0 * NPY0), 6666, MPI_COMM_WORLD);
// if (my_id == proc_k * NPX0 * NPY0)
// MPI_Recv(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, 6666, MPI_COMM_WORLD, &status);
// }
// if (npz == proc_k)
// {
// // j方向分散
// if (npx == 0)
// {
// MPI_Scatterv(buff2d, sendcounts1, displs1, OCFD_DATA_TYPE, buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
//
// REAL *pbuff_recv;
// REAL *ppU;
// // i方向数据准备与离散
// for (int npx1 = 0; npx1 < NPX0; npx1++)
// {
// ppU = buff2 + displs2[npx1];
// for (int j = 0; j < ny; j++)
// {
// for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
// {
// (*ppU++) = buff1[j][i];
// }
// }
// }
// }
// //buff_recv = buff2;
// MPI_Scatterv(buff2, sendcounts2, displs2, OCFD_DATA_TYPE, buff_recv, nx * ny, OCFD_DATA_TYPE, 0, MPI_COMM_X);
//
// // 数据分布
// {
// REAL *pbuff_recv;
// REAL *ppU;
// ppU = pU + k_local * nx * ny;
// pbuff_recv = buff_recv;
// for (int nn = 0; nn < nx * ny; nn++)
// {
// (*ppU++) = (*pbuff_recv++);
// }
// }
// }
// }
//
// if (npx == 0)
// {
// if (npy == 0)
// {
// free(buff2d);
// }
// free(buff1);
// free(buff2);
// }
// free(buff_recv);
//}
//void read_3d(
// MPI_File file,
// REAL *pU)
//{
// size_t displs_start, displs_end, displs_k_start, displs_k_end;
//
// REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
//
// displs_start = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
// displs_end = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * (NZ_GLOBAL-k_offset[npz]-nz);
//
// displs_k_start = sizeof(int) + (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
// displs_k_end = 2*sizeof(int) + (NY_GLOBAL - ny) * NX_GLOBAL * sizeof(REAL);
//
// if (my_id == 0) printf("read 3d data ...\n");
//
// MPI_File_seek(file, displs_start + displs_k_start, MPI_SEEK_CUR);
//
// for(int k=0; k<nz; k++){
//
// MPI_File_read(file, buff_recv, NX_GLOBAL*ny, OCFD_DATA_TYPE, &status);
//
// MPI_File_seek(file, displs_k_end, MPI_SEEK_CUR);
//
// // 数据分布
// {
// REAL *ppU;
// ppU = pU + k * nx * ny;
//
// for(int j=0;j<ny;j++){
// for(int i=0;i<nx;i++){
// *(ppU+j*nx+i) = *(buff_recv+j*NX_GLOBAL+i);
// }
// }
// }
// }
//
// MPI_File_seek(file, displs_end - displs_k_start, MPI_SEEK_CUR);
//
// free(buff_recv);
//}
/*
 * Read this rank's nx * ny * nz sub-block of one field from an MPI file into
 * a packed (halo-free) array.
 *
 * file   : open MPI file handle.
 * offset : byte offset at which the field starts in the file.
 * pU     : destination, nx * ny * nz values with x fastest.
 *
 * For each local k-plane, NX_GLOBAL * ny values are read starting at this
 * rank's (i_offset, j_offset) position inside the plane; the first nx values
 * of every NX_GLOBAL-long row are then kept. Consecutive planes are
 * 2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL) bytes apart.
 */
void read_3d(
    MPI_File file,
    MPI_Offset offset,
    REAL *pU)
{
    /* Distance in bytes between the same point of two consecutive planes. */
    const size_t plane_stride = 2*sizeof(int) + NY_GLOBAL * NX_GLOBAL * sizeof(REAL);

    size_t file_pos, in_plane_pos;
    REAL *row_block = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);

    /* Start of this rank's first plane, then its first point in that plane
     * (past the leading int marker). */
    file_pos = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
    in_plane_pos = sizeof(int) + (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
    file_pos += in_plane_pos + offset;

    if (my_id == 0) printf("read 3d data ...\n");

    for(int k=0; k<nz; k++){
        MPI_File_read_at(file, file_pos, row_block, NX_GLOBAL*ny, OCFD_DATA_TYPE, &status);
        file_pos += plane_stride;

        // Scatter the rows just read into the local plane k.
        REAL *plane_dst = pU + k * nx * ny;
        for(int j=0;j<ny;j++){
            REAL *dst_row = plane_dst + j*nx;
            REAL *src_row = row_block + j*NX_GLOBAL;
            for(int i=0;i<nx;i++){
                dst_row[i] = src_row[i];
            }
        }
    }

    free(row_block);
}
//------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------
/*
 * Write this rank's packed nx * ny * nz sub-block of one field to an MPI
 * file, one global k-plane at a time.
 *
 * file   : open MPI file handle.
 * offset : byte offset at which the field starts in the file.
 * pU     : packed (halo-free) local data, x fastest.
 *
 * Per plane:
 *   1. every rank contributes its nx*ny slab to the npx == 0 rank of its
 *      MPI_COMM_X (Gatherv);
 *   2. that rank reorders the slabs into full NX_GLOBAL-long rows and
 *      contributes them to the npy == 0 rank of MPI_COMM_Y (Gatherv);
 *   3. the rank with npx == 0 && npy == 0 writes the record
 *      [int marker][NX_GLOBAL*NY_GLOBAL REAL][int marker], where the marker
 *      holds the plane size in bytes.
 *
 * Fixes relative to the previous version:
 *  - the record buffer no longer assumes sizeof(REAL) == 2*sizeof(int); the
 *    old indexing wrote out of bounds and produced a wrong record for a
 *    4-byte REAL. The record is now built and written byte-wise;
 *  - the plane is gathered into a properly aligned REAL buffer;
 *  - buffers that are unused on a rank are NULL instead of indeterminate;
 *  - the unused array view and the redundant send copy are gone.
 */
void write_3d(
    MPI_File file,
    MPI_Offset offset,
    REAL *pU)
{
    const size_t plane_pts    = (size_t)(NX_GLOBAL) * (size_t)(NY_GLOBAL);
    const size_t plane_bytes  = plane_pts * sizeof(REAL);
    const size_t record_bytes = plane_bytes + 2 * sizeof(int);
    const int    marker       = (int)plane_bytes;

    REAL(*buff1)[NX_GLOBAL] = NULL;  /* npx == 0: rows in global-x order            */
    REAL *buff2  = NULL;             /* npx == 0: raw result of the x gather        */
    REAL *plane  = NULL;             /* npx == 0 && npy == 0: assembled plane       */
    char *record = NULL;             /* npx == 0 && npy == 0: marker|plane|marker   */

    int recvcounts1[NPY0], displs1[NPY0], recvcounts2[NPX0], displs2[NPX0];

    /* File position of this rank's first plane record. */
    MPI_Offset displs_k = offset + (MPI_Offset)k_offset[npz] * (MPI_Offset)record_bytes;

    if (npx == 0)
    {
        if (npy == 0)
        {
            plane  = (REAL *)malloc(plane_bytes);
            record = (char *)malloc(record_bytes);
            if (plane == NULL || record == NULL)
            {
                fprintf(stderr, "write_3d: out of memory\n");
                MPI_Abort(MPI_COMM_WORLD, 1);
            }
            /* The two markers never change; set them once. */
            memcpy(record, &marker, sizeof(int));
            memcpy(record + sizeof(int) + plane_bytes, &marker, sizeof(int));
        }
        buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
        buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
        if (buff1 == NULL || buff2 == NULL)
        {
            fprintf(stderr, "write_3d: out of memory\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }

    //---------------------------------------------------------------
    if (my_id == 0)
        printf("write 3d data ...\n");

    // recvcounts1 / displs1: counts and displacements for the gather along j.
    // The j gather runs after the i gather, so only one column of ranks
    // (npx == 0) takes part in it.
    for (int j = 0; j < NPY0; j++)
    {
        recvcounts1[j] = NX_GLOBAL * j_nn[j];
        displs1[j] = j_offset[j] * NX_GLOBAL;
    }

    // Counts and displacements for the gather along i.
    for (int i = 0; i < NPX0; i++)
    {
        recvcounts2[i] = ny * i_nn[i];
        displs2[i] = i_offset[i] * ny;
    }

    // Loop over the local k-planes.
    for (int kk = 0; kk < nz; kk++)
    {
        /* The local plane is contiguous, so it is sent in place. */
        REAL *local_plane = pU + (size_t)kk * nx * ny;

        MPI_Gatherv(local_plane, nx * ny, OCFD_DATA_TYPE, buff2, recvcounts2, displs2, OCFD_DATA_TYPE, 0, MPI_COMM_X);

        if (npx == 0)
        {
            // Reorder the gathered slabs into full rows before the j gather.
            for (int npx1 = 0; npx1 < NPX0; npx1++)
            {
                REAL *src = buff2 + displs2[npx1];

                for (int j = 0; j < ny; j++)
                {
                    for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
                    {
                        buff1[j][i] = (*src++);
                    }
                }
            }
            MPI_Gatherv(buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, plane, recvcounts1, displs1, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
        }

        if (npx == 0 && npy == 0){
            memcpy(record + sizeof(int), plane, plane_bytes);
            MPI_File_write_at(file, displs_k, record, (int)record_bytes, MPI_BYTE, &status);
        }

        displs_k += (MPI_Offset)record_bytes;
    }

    /* free(NULL) is a no-op, so ranks that never allocated are fine. */
    free(record);
    free(plane);
    free(buff1);
    free(buff2);
}
//void write_3d(
// MPI_File file,
// REAL *pU)
//{
// size_t displs_xy;
// size_t size = NX_GLOBAL*NY_GLOBAL*sizeof(REAL);
// size_t displs_non0_start = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL)) * k_offset[npz];
// size_t displs_non0_end = (2*sizeof(int) + NX_GLOBAL*NY_GLOBAL*sizeof(REAL))*(NZ_GLOBAL-k_offset[npz]-nz);
//
// REAL *buff_recv = (REAL *)malloc(sizeof(REAL) * nx * ny);
// displs_xy = (i_offset[npx] + j_offset[npy] * NX_GLOBAL) * sizeof(REAL);
//
// if(my_id == 0){
// for(int i=0; i<k_offset[npz]; i++){
// MPI_File_write_all(file, &size, 1, MPI_INT, &status);
// MPI_File_seek(file, size, MPI_SEEK_CUR);
// MPI_File_write_all(file, &size, 1, MPI_INT, &status);
// }
// }else{
// MPI_File_seek(file, displs_non0_start, MPI_SEEK_CUR);
// }
//
// for(int k=0; k<nz; k++){
// // 数据分布
// {
// REAL *ppU;
// ppU = pU + k * nx * ny;
//
// for(int j=0;j<ny;j++){
// for(int i=0;i<nx;i++){
// *(buff_recv+j*nx+i) = *(ppU+j*nx+i);
// }
// }
// }
//
// if(my_id == 0){
// MPI_File_write_all(file, &size, 1, MPI_INT, &status);
// }else{
// MPI_File_seek(file, sizeof(int), MPI_SEEK_CUR);
// }
//
// MPI_File_seek(file, displs_xy, MPI_SEEK_CUR);
//
// for(int j = 0; j < ny; j++){
// MPI_File_write_all(file, buff_recv + nx*j, nx, OCFD_DATA_TYPE, &status);
//
// MPI_File_seek(file, sizeof(REAL)*(NX_GLOBAL-nx), MPI_SEEK_CUR);
// }
//
// MPI_File_seek(file, sizeof(REAL)*((NY_GLOBAL-j_offset[npy]-ny)*NX_GLOBAL-i_offset[npx]), MPI_SEEK_CUR);
//
// if(my_id == 0){
// MPI_File_write_all(file, &size, 1, MPI_INT, &status);
// }else{
// MPI_File_seek(file, sizeof(int), MPI_SEEK_CUR);
// }
//
// }
//
// if(my_id == 0){
// for(int i=0; i<(NZ_GLOBAL-k_offset[npz]-nz); i++){
// MPI_File_write_all(file, &size, 1, MPI_INT, &status);
// MPI_File_seek(file, size, MPI_SEEK_CUR);
// MPI_File_write_all(file, &size, 1, MPI_INT, &status);
// }
// }else{
// MPI_File_seek(file, displs_non0_end, MPI_SEEK_CUR);
// }
//
// if (my_id == 0) printf("write 3d data ...\n");
//
// free(buff_recv);
//}
//------------------------------------------------------------------------------------------
//------------------------------------------------------------------------------------------
//void write_3d(
// FILE *file,
// REAL *pU)
//{
//
// REAL(*U)
// [ny][nx] = PTR2ARRAY2(pU, nx, ny);
// REAL(*buff2d)
// [NX_GLOBAL], (*buff1)[NX_GLOBAL], *buff2, *buff_send;
//
// int recvcounts1[NPY0], displs1[NPY0], recvcounts2[NPX0], displs2[NPX0];
//
// if (npx == 0)
// {
// if (npy == 0)
// {
// buff2d = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * NY_GLOBAL);
// }
// buff1 = (REAL(*)[NX_GLOBAL])malloc(sizeof(REAL) * NX_GLOBAL * ny);
// buff2 = (REAL *)malloc(sizeof(REAL) * NX_GLOBAL * ny);
// }
// buff_send = (REAL *)malloc(sizeof(REAL) * nx * ny);
//
// //---------------------------------------------------------------
// if (my_id == 0)
// printf("write 3d data ...\n");
// // recvconts1 , displs1 存储j方向收集时所使用的个数与偏移,
// // 由于j方向收集发生在i方向收集之后,因此只有一列参与j方向收集
// for (int j = 0; j < NPY0; j++)
// {
// recvcounts1[j] = NX_GLOBAL * j_nn[j];
// displs1[j] = j_offset[j] * NX_GLOBAL;
// }
// // i方向收集所需偏移与数量
// for (int i = 0; i < NPX0; i++)
// {
// recvcounts2[i] = ny * i_nn[i];
// displs2[i] = i_offset[i] * ny;
// }
//
// // 按数据的k面进行循环
// int proc_k, k_local;
// for (int kk = 0; kk < NZ_GLOBAL; kk++)
// {
// get_k_node(kk, &proc_k, &k_local);
// if (npz == proc_k)
// {
// REAL *pbuff_send = (REAL *)buff_send;
// REAL *ppU = pU + k_local * nx * ny;
// // i方向收集数据准备
// for (int n = 0; n < nx * ny; n++)
// (*pbuff_send++) = (*ppU++);
// MPI_Gatherv(buff_send, nx * ny, OCFD_DATA_TYPE, buff2, recvcounts2, displs2, OCFD_DATA_TYPE, 0, MPI_COMM_X);
//
// if (npx == 0)
// {
// // j方向收集数据调序
// for (int npx1 = 0; npx1 < NPX0; npx1++)
// {
// ppU = buff2 + displs2[npx1];
//
// for (int j = 0; j < ny; j++)
// {
// for (int i = i_offset[npx1]; i < i_offset[npx1] + i_nn[npx1]; i++)
// {
// buff1[j][i] = (*ppU++);
// }
// }
// }
// MPI_Gatherv(buff1, NX_GLOBAL * ny, OCFD_DATA_TYPE, buff2d, recvcounts1, displs1, OCFD_DATA_TYPE, 0, MPI_COMM_Y);
// }
// }
//
// //
//
// // k 方向收集
// if (proc_k != 0)
// {
// if (npx == 0 && npy == 0 && npz == proc_k)
// {
// MPI_Send(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, 0, 666, MPI_COMM_WORLD);
// }
// if (my_id == 0)
// {
// MPI_Status status;
// MPI_Recv(buff2d, NX_GLOBAL * NY_GLOBAL, OCFD_DATA_TYPE, proc_k * NPX0 * NPY0, 666, MPI_COMM_WORLD, &status);
// }
// }
// if (my_id == 0)
// FWRITE(buff2d, sizeof(REAL), NX_GLOBAL * NY_GLOBAL, file)
// }
//
// if (npx == 0)
// {
// if (npy == 0)
// {
// free(buff2d);
// }
// free(buff1);
// free(buff2);
// }
// free(buff_send);
//}
//------------------------------------------------------------------------------------------------------------------
//------------------------------------Write blockdata from 3d array-------------------------------------------------
/*
 * Gather a rectangular sub-block of a halo-padded 3-D field onto rank 0 and
 * write it to `file`.
 *
 *   file            output stream (only used on rank 0)
 *   pU              local field, viewed as [..][ny+2*LAP][nx+2*LAP]
 *   ib..ie, jb..je, kb..ke
 *                   inclusive global index ranges of the block, 1-based
 *                   (Fortran-style), as they come from the input file
 *
 * Every rank copies the part of the block it owns into a zero-initialised
 * buffer; the buffers are summed onto rank 0 with MPI_Reduce, so each cell
 * ends up holding the value of its single owner.
 */
void write_blockdata(
    FILE *file,
    REAL *pU,
    int ib,
    int ie,
    int jb,
    int je,
    int kb,
    int ke)
{
    /* Block extents, computed before the bounds are shifted to 0-based. */
    int nx1 = ie - ib + 1, ny1 = je - jb + 1, nz1 = ke - kb + 1;

    REAL(*U)
    [ny + 2 * LAP][nx + 2 * LAP] = PTR2ARRAY2(pU, nx + 2 * LAP, ny + 2 * LAP);

    /* U1: this rank's contribution, U0: reduced result on rank 0. */
    REAL U1[nz1][ny1][nx1], U0[nz1][ny1][nx1];

    /* Clear both buffers in a single pass. */
    REAL *local_buf = &U1[0][0][0];
    REAL *global_buf = &U0[0][0][0];
    for (int n = 0; n < nx1 * ny1 * nz1; n++)
    {
        local_buf[n] = 0.0;
        global_buf[n] = 0.0;
    }

    /* The input file is assumed to use Fortran indices (starting at 1). */
    ib -= 1;
    jb -= 1;
    kb -= 1;

    /* Global index of this rank's first interior cell in each direction. */
    int gkb = k_offset[npz];
    int gjb = j_offset[npy];
    int gib = i_offset[npx];

    for (int k = 0; k < nz; k++)
    {
        int k0 = k + gkb;
        if (k0 < kb || k0 >= ke)
            continue;
        for (int j = 0; j < ny; j++)
        {
            int j0 = j + gjb;
            if (j0 < jb || j0 >= je)
                continue;
            for (int i = 0; i < nx; i++)
            {
                int i0 = i + gib;
                if (i0 < ib || i0 >= ie)
                    continue;
                /* Skip the LAP halo layers when reading the local field. */
                U1[k0 - kb][j0 - jb][i0 - ib] = U[k + LAP][j + LAP][i + LAP];
            }
        }
    }

    MPI_Reduce(&U1[0][0][0], &U0[0][0][0], nx1 * ny1 * nz1, OCFD_DATA_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
    if (my_id == 0)
        FWRITE(&U0[0][0][0], sizeof(REAL), nx1 * ny1 * nz1, file)
}
#ifdef __cplusplus
}
#endif
//--------------------------------------------------
#include <math.h>
#include "OCFD_NS_Jacobian3d.h"
#include "parameters.h"
#include "OCFD_Schemes_Choose.h"
#include "OCFD_split.h"
#include "commen_kernel.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#include "OCFD_mpi_dev.h"
#include "parameters_d.h"
#include "OCFD_flux_charteric.h"
#ifdef __cplusplus
extern "C" {
#endif
// Launch sound_speed_kernel on *pT_d / *pcc_d over the region job_in
// widened by LAP cells on every side, on the given stream.
void du_invis_Jacobian3d_init(cudaJobPackage job_in, cudaStream_t *stream){
    dim3 size;
    jobsize(&job_in, &size);

    // Launch geometry covers the job extent plus LAP cells on each side.
    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x+2*LAP, size.y+2*LAP, size.z+2*LAP);

    // Same region as job_in, extended by LAP in both directions.
    dim3 lower(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP);
    dim3 upper(job_in.end.x + LAP, job_in.end.y + LAP, job_in.end.z + LAP);
    cudaJobPackage job(lower, upper);

    CUDA_LAUNCH(( sound_speed_kernel<<<griddim , blockdim, 0, *stream>>>(*pT_d , *pcc_d , job) ));
}
// x-direction inviscid contribution: runs OCFD_dx1 on *fp and OCFD_dx2 on *fm
// (presumably the positive / negative split fluxes), both targeting *pdu_d,
// using the Ak* metric fields and the x boundary flags D0_bound[0..1].
void du_invis_Jacobian3d_x(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, cudaStream_t *stream){
    OCFD_dx1(*fp, *pdu_d, *pAjac_d,
             *pu_d, *pv_d, *pw_d, *pcc_d,
             *pAkx_d, *pAky_d, *pAkz_d,
             job_in, BlockDim_X, stream, D0_bound[0], D0_bound[1]);

    OCFD_dx2(*fm, *pdu_d, *pAjac_d,
             *pu_d, *pv_d, *pw_d, *pcc_d,
             *pAkx_d, *pAky_d, *pAkz_d,
             job_in, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
}
// y-direction inviscid contribution: runs OCFD_dy1 on *fp and OCFD_dy2 on *fm
// (presumably the positive / negative split fluxes), both targeting *pdu_d,
// using the Ai* metric fields and the y boundary flags D0_bound[2..3].
void du_invis_Jacobian3d_y(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, cudaStream_t *stream){
    OCFD_dy1(*fp, *pdu_d, *pAjac_d,
             *pu_d, *pv_d, *pw_d, *pcc_d,
             *pAix_d, *pAiy_d, *pAiz_d,
             job_in, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);

    OCFD_dy2(*fm, *pdu_d, *pAjac_d,
             *pu_d, *pv_d, *pw_d, *pcc_d,
             *pAix_d, *pAiy_d, *pAiz_d,
             job_in, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
}
// z-direction inviscid contribution: runs OCFD_dz1 on *fp and OCFD_dz2 on *fm
// (presumably the positive / negative split fluxes), both targeting *pdu_d,
// using the As* metric fields and the z boundary flags D0_bound[4..5].
void du_invis_Jacobian3d_z(cudaJobPackage job_in, cudaSoA *fp, cudaSoA *fm, cudaStream_t *stream){
    OCFD_dz1(*fp, *pdu_d, *pAjac_d,
             *pu_d, *pv_d, *pw_d, *pcc_d,
             *pAsx_d, *pAsy_d, *pAsz_d,
             job_in, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);

    OCFD_dz2(*fm, *pdu_d, *pAjac_d,
             *pu_d, *pv_d, *pw_d, *pcc_d,
             *pAsx_d, *pAsy_d, *pAsz_d,
             job_in, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);
}
// ========================================================
// Compute the derivatives of u, v, w and T along the three grid directions
// over the region [LAP, n*_lap) and store them in the *k (x), *i (y) and
// *s (z) device fields. All work is issued on `stream`.
void du_viscous_Jacobian3d_init(cudaStream_t *stream){
    dim3 first(LAP, LAP, LAP);
    dim3 last(nx_lap, ny_lap, nz_lap);
    cudaJobPackage job(first, last);

    // x direction -> puk/pvk/pwk/pTk
    OCFD_dx0(*pu_d, *puk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pv_d, *pvk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pw_d, *pwk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pT_d, *pTk_d, job, BlockDim, stream, D0_bound[0], D0_bound[1]);

    // y direction -> pui/pvi/pwi/pTi
    OCFD_dy0(*pu_d, *pui_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pv_d, *pvi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pw_d, *pwi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pT_d, *pTi_d, job, BlockDim, stream, D0_bound[2], D0_bound[3]);

    // z direction -> pus/pvs/pws/pTs
    OCFD_dz0(*pu_d, *pus_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pv_d, *pvs_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pw_d, *pws_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pT_d, *pTs_d, job, BlockDim, stream, D0_bound[4], D0_bound[5]);
}
// Compute the six independent components of the viscous stress tensor at
// cell (x, y, z) (indices that include the LAP offset).
//
//   vf            bundle of device fields; the *k / *i / *s velocity
//                 derivative fields are read at (x-LAP, y-LAP, z-LAP)
//   Akx..Asz      metric coefficients at this cell (read only)
//   Amu           viscosity at this cell (read only)
//   s11..s33      outputs: stress components
//
// The computational-space derivatives are combined with the metrics into
// physical-space gradients; the stresses are then
//   s_ii = (2*d(u_i)/d(x_i) - 2/3*div) * mu,   s_ij = (d(u_i)/d(x_j) + d(u_j)/d(x_i)) * mu.
__device__ void vis_flux_s_ker(
    vis_flux vf,
    REAL *Akx,
    REAL *Aix,
    REAL *Asx,
    REAL *Aky,
    REAL *Aiy,
    REAL *Asy,
    REAL *Akz,
    REAL *Aiz,
    REAL *Asz,
    REAL *Amu,
    REAL *s11,
    REAL *s12,
    REAL *s13,
    REAL *s22,
    REAL *s23,
    REAL *s33,
    int x,
    int y,
    int z
){
    // Velocity derivatives along the three grid directions.
    const REAL uk = get_Field(vf.uk, x-LAP, y-LAP, z-LAP);
    const REAL ui = get_Field(vf.ui, x-LAP, y-LAP, z-LAP);
    const REAL us = get_Field(vf.us, x-LAP, y-LAP, z-LAP);
    const REAL vk = get_Field(vf.vk, x-LAP, y-LAP, z-LAP);
    const REAL vi = get_Field(vf.vi, x-LAP, y-LAP, z-LAP);
    const REAL vs = get_Field(vf.vs, x-LAP, y-LAP, z-LAP);
    const REAL wk = get_Field(vf.wk, x-LAP, y-LAP, z-LAP);
    const REAL wi = get_Field(vf.wi, x-LAP, y-LAP, z-LAP);
    const REAL ws = get_Field(vf.ws, x-LAP, y-LAP, z-LAP);

    // Local copies of the metric coefficients and viscosity.
    const REAL kx = *Akx, ix = *Aix, sx = *Asx;
    const REAL ky = *Aky, iy = *Aiy, sy = *Asy;
    const REAL kz = *Akz, iz = *Aiz, sz = *Asz;
    const REAL mu = *Amu;

    // Chain rule: physical-space velocity gradients.
    const REAL ux = uk*kx + ui*ix + us*sx;
    const REAL vx = vk*kx + vi*ix + vs*sx;
    const REAL wx = wk*kx + wi*ix + ws*sx;

    const REAL uy = uk*ky + ui*iy + us*sy;
    const REAL vy = vk*ky + vi*iy + vs*sy;
    const REAL wy = wk*ky + wi*iy + ws*sy;

    const REAL uz = uk*kz + ui*iz + us*sz;
    const REAL vz = vk*kz + vi*iz + vs*sz;
    const REAL wz = wk*kz + wi*iz + ws*sz;

    const REAL div = ux+vy+wz;

    // Normal stresses.
    *s11 = (2.0*ux-2.0/3.0*div) * mu;
    *s22 = (2.0*vy-2.0/3.0*div) * mu;
    *s33 = (2.0*wz-2.0/3.0*div) * mu;

    // Shear stresses.
    *s12 = (uy+vx)* mu;
    *s13 = (uz+wx)* mu;
    *s23 = (vz+wy)* mu;
}
// Compute the three components of the viscous energy flux at cell (x, y, z)
// (indices that include the LAP offset).
//
//   vf            bundle of device fields; Tk/Ti/Ts are read at
//                 (x-LAP, y-LAP, z-LAP), u/v/w at (x, y, z)
//   Amu           viscosity at this cell
//   Akx..Asz      metric coefficients at this cell
//   s11..s33      stress components at this cell (inputs)
//   E1..E3        outputs: u_j * s_ij + (Amu * vis_flux_init_c_d) * dT/dx_i
__device__ void vis_flux_e_ker(
    vis_flux vf,
    REAL *Amu,
    REAL *Akx,
    REAL *Aky,
    REAL *Akz,
    REAL *Aix,
    REAL *Aiy,
    REAL *Aiz,
    REAL *Asx,
    REAL *Asy,
    REAL *Asz,
    REAL *s11,
    REAL *s12,
    REAL *s13,
    REAL *s22,
    REAL *s23,
    REAL *s33,
    REAL *E1,
    REAL *E2,
    REAL *E3,
    int x,
    int y,
    int z
){
    // Temperature derivatives along the three grid directions.
    const REAL Tk = get_Field(vf.Tk, x-LAP, y-LAP, z-LAP);
    const REAL Ti = get_Field(vf.Ti, x-LAP, y-LAP, z-LAP);
    const REAL Ts = get_Field(vf.Ts, x-LAP, y-LAP, z-LAP);

    // Velocity at this cell.
    const REAL u = get_Field_LAP(vf.u, x, y, z);
    const REAL v = get_Field_LAP(vf.v, x, y, z);
    const REAL w = get_Field_LAP(vf.w, x, y, z);

    // Coefficient applied to the temperature gradient.
    const REAL Amuk = *Amu * vis_flux_init_c_d;

    // Physical-space temperature gradient via the metrics.
    const REAL Tx = Tk* *Akx + Ti* *Aix + Ts* *Asx;
    const REAL Ty = Tk* *Aky + Ti* *Aiy + Ts* *Asy;
    const REAL Tz = Tk* *Akz + Ti* *Aiz + Ts* *Asz;

    // Work of the stresses plus the temperature-gradient term.
    *E1 = u* *s11 + v* *s12 + w* *s13 + Amuk*Tx;
    *E2 = u* *s12 + v* *s22 + w* *s23 + Amuk*Ty;
    *E3 = u* *s13 + v* *s23 + w* *s33 + Amuk*Tz;
}
// Project the stress tensor and energy flux onto one grid direction and
// store the result in Ev1..Ev4 at cell (x, y, z) (LAP-offset indices).
//
//   vf            bundle of device fields; vf.Ax/Ay/Az select the direction
//                 and are scaled by vf.Ajac
//   s11..s33      stress components at this cell
//   E1..E3        energy-flux components at this cell
//   Ev1..Ev4      output fields (three momentum components and energy)
__device__ void vis_flus_ev_ker(
    vis_flux vf,
    REAL *s11,
    REAL *s12,
    REAL *s13,
    REAL *s22,
    REAL *s23,
    REAL *s33,
    REAL *E1,
    REAL *E2,
    REAL *E3,
    cudaField Ev1,
    cudaField Ev2,
    cudaField Ev3,
    cudaField Ev4,
    int x,
    int y,
    int z
){
    // Direction metrics multiplied by the Ajac field value.
    const REAL Aj1 = get_Field_LAP(vf.Ajac , x,y,z);
    const REAL akx = get_Field_LAP(vf.Ax, x, y, z)*Aj1;
    const REAL aky = get_Field_LAP(vf.Ay, x, y, z)*Aj1;
    const REAL akz = get_Field_LAP(vf.Az, x, y, z)*Aj1;

    // Momentum fluxes: rows of the stress tensor dotted with (akx, aky, akz).
    get_Field_LAP(Ev1, x, y, z) = ( akx* *s11 + aky* *s12 + akz* *s13 );
    get_Field_LAP(Ev2, x, y, z) = ( akx* *s12 + aky* *s22 + akz* *s23 );
    get_Field_LAP(Ev3, x, y, z) = ( akx* *s13 + aky* *s23 + akz* *s33 );

    // Energy flux.
    get_Field_LAP(Ev4, x, y, z) = ( akx* *E1 + aky* *E2 + akz* *E3 );
}
// Kernel: one thread per cell of `job` (indices include the LAP offset).
// For each cell it evaluates the viscous stresses, the energy flux, and
// writes the direction-projected viscous fluxes into Ev1..Ev4.
// Threads falling outside [job.start, job.end) do nothing.
__global__ void vis_flux_ker(
    vis_flux vf,
    cudaField Ev1,
    cudaField Ev2,
    cudaField Ev3,
    cudaField Ev4,
    cudaJobPackage job)
{
    // Cell indices, including the LAP offset carried by job.start.
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if( x>=job.end.x || y>=job.end.y || z>=job.end.z ) return;

    // Metric coefficients at this cell.
    REAL Akx = get_Field_LAP(vf.Akx, x, y, z);
    REAL Aix = get_Field_LAP(vf.Aix, x, y, z);
    REAL Asx = get_Field_LAP(vf.Asx, x, y, z);
    REAL Aky = get_Field_LAP(vf.Aky, x, y, z);
    REAL Aiy = get_Field_LAP(vf.Aiy, x, y, z);
    REAL Asy = get_Field_LAP(vf.Asy, x, y, z);
    REAL Akz = get_Field_LAP(vf.Akz, x, y, z);
    REAL Aiz = get_Field_LAP(vf.Aiz, x, y, z);
    REAL Asz = get_Field_LAP(vf.Asz, x, y, z);

    // Viscosity is stored without the LAP offset.
    REAL Amu = get_Field(vf.Amu, x-LAP, y-LAP, z-LAP);

    // Stage 1: stress tensor.
    REAL s11, s12, s13, s22, s23, s33;
    vis_flux_s_ker(vf,&Akx,&Aix,&Asx,&Aky,&Aiy,&Asy,&Akz,&Aiz,&Asz,&Amu,&s11,&s12,&s13,&s22,&s23,&s33,x,y,z);

    // Stage 2: energy flux.
    REAL E1, E2, E3;
    vis_flux_e_ker(vf,&Amu,&Akx,&Aky,&Akz,&Aix,&Aiy,&Aiz,&Asx,&Asy,&Asz,
                   &s11,&s12,&s13,&s22,&s23,&s33,&E1,&E2,&E3,x,y,z);

    // Stage 3: project and store.
    vis_flus_ev_ker(vf,&s11,&s12,&s13,&s22,&s23,&s33,&E1,&E2,&E3,
                    Ev1,Ev2,Ev3,Ev4,x,y,z);
}
// Launch vis_flux_ker with the Ak* metrics as the projection direction,
// filling *pEv1_d..*pEv4_d over the region [LAP, n*_lap). Uses a fixed
// 8x4x4 block-size request.
void du_viscous_Jacobian3d_x_init(cudaStream_t *stream){
    uint32_t BlockDimX1 = 8;
    uint32_t BlockDimY1 = 4;
    uint32_t BlockDimZ1 = 4;

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX1, BlockDimY1, BlockDimZ1, nx, ny, nz);

    dim3 first(LAP, LAP, LAP);
    dim3 last(nx_lap, ny_lap, nz_lap);
    cudaJobPackage job(first, last);

    vis_flux vis_flux_parameter = {
        // derivative fields of u, v, w along the three grid directions
        *puk_d, *pvk_d, *pwk_d,
        *pui_d, *pvi_d, *pwi_d,
        *pus_d, *pvs_d, *pws_d,
        // temperature derivative fields and viscosity
        *pTk_d, *pTi_d, *pTs_d, *pAmu_d,
        // velocity, then the projection-direction metrics (Ak* here)
        *pu_d, *pv_d, *pw_d, *pAkx_d, *pAky_d, *pAkz_d,
        // Ajac field followed by the full set of metrics
        *pAjac_d,
        *pAkx_d, *pAky_d, *pAkz_d,
        *pAix_d, *pAiy_d, *pAiz_d,
        *pAsx_d, *pAsy_d, *pAsz_d};

    CUDA_LAUNCH(( vis_flux_ker<<<griddim , blockdim, 0, *stream>>>(vis_flux_parameter, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
}
// Differentiate the viscous fluxes *pEv1_d..*pEv4_d along x into
// vis_u/v/w/T, then launch YF_Pe_XF once per component to combine each
// result (together with *pAjac_d) into components 1..4 of *pdu_d.
// Component 0 of *pdu_d is not touched here.
void du_viscous_Jacobian3d_x_final(cudaJobPackage job_in, cudaStream_t *stream){
    dim3 size;
    jobsize(&job_in, &size);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    OCFD_dx0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[0], D0_bound[1]);

    // Same region as job_in, shifted down by LAP.
    dim3 lower(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP);
    dim3 upper(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP);
    cudaJobPackage job(lower, upper);

    // Element count of one component of *pdu_d.
    int size_du = pdu_d->pitch*ny*nz;

    // View onto one component of *pdu_d; starts at component 1.
    cudaField tmp_du;
    tmp_du.pitch = pdu_d->pitch;
    tmp_du.ptr = pdu_d->ptr + size_du;
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_u_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 2
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_v_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 3
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_w_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 4
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_T_d, *pAjac_d, job) ));
}
// Launch vis_flux_ker with the Ai* metrics as the projection direction,
// filling *pEv1_d..*pEv4_d over the region [LAP, n*_lap). Uses a fixed
// 8x4x4 block-size request.
void du_viscous_Jacobian3d_y_init(cudaStream_t *stream){
    uint32_t BlockDimX1 = 8;
    uint32_t BlockDimY1 = 4;
    uint32_t BlockDimZ1 = 4;

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX1, BlockDimY1, BlockDimZ1, nx, ny, nz);

    dim3 first(LAP, LAP, LAP);
    dim3 last(nx_lap, ny_lap, nz_lap);
    cudaJobPackage job(first, last);

    vis_flux vis_flux_parameter = {
        // derivative fields of u, v, w along the three grid directions
        *puk_d, *pvk_d, *pwk_d,
        *pui_d, *pvi_d, *pwi_d,
        *pus_d, *pvs_d, *pws_d,
        // temperature derivative fields and viscosity
        *pTk_d, *pTi_d, *pTs_d, *pAmu_d,
        // velocity, then the projection-direction metrics (Ai* here)
        *pu_d, *pv_d, *pw_d, *pAix_d, *pAiy_d, *pAiz_d,
        // Ajac field followed by the full set of metrics
        *pAjac_d,
        *pAkx_d, *pAky_d, *pAkz_d,
        *pAix_d, *pAiy_d, *pAiz_d,
        *pAsx_d, *pAsy_d, *pAsz_d};

    CUDA_LAUNCH(( vis_flux_ker<<<griddim , blockdim, 0, *stream>>>(vis_flux_parameter, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
}
// Differentiate the viscous fluxes *pEv1_d..*pEv4_d along y into
// vis_u/v/w/T, then launch YF_Pe_XF once per component to combine each
// result (together with *pAjac_d) into components 1..4 of *pdu_d.
// Component 0 of *pdu_d is not touched here.
void du_viscous_Jacobian3d_y_final(cudaJobPackage job_in, cudaStream_t *stream){
    dim3 size;
    jobsize(&job_in, &size);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    OCFD_dy0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[2], D0_bound[3]);

    // Same region as job_in, shifted down by LAP.
    dim3 lower(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP);
    dim3 upper(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP);
    cudaJobPackage job(lower, upper);

    // Element count of one component of *pdu_d.
    int size_du = pdu_d->pitch*ny*nz;

    // View onto one component of *pdu_d; starts at component 1.
    cudaField tmp_du;
    tmp_du.pitch = pdu_d->pitch;
    tmp_du.ptr = pdu_d->ptr + size_du;
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_u_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 2
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_v_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 3
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_w_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 4
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_T_d, *pAjac_d, job) ));
}
// Launch vis_flux_ker with the As* metrics as the projection direction,
// filling *pEv1_d..*pEv4_d over the region [LAP, n*_lap). Uses a fixed
// 8x4x4 block-size request.
void du_viscous_Jacobian3d_z_init(cudaStream_t *stream){
    uint32_t BlockDimX1 = 8;
    uint32_t BlockDimY1 = 4;
    uint32_t BlockDimZ1 = 4;

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX1, BlockDimY1, BlockDimZ1, nx, ny, nz);

    dim3 first(LAP, LAP, LAP);
    dim3 last(nx_lap, ny_lap, nz_lap);
    cudaJobPackage job(first, last);

    vis_flux vis_flux_parameter = {
        // derivative fields of u, v, w along the three grid directions
        *puk_d, *pvk_d, *pwk_d,
        *pui_d, *pvi_d, *pwi_d,
        *pus_d, *pvs_d, *pws_d,
        // temperature derivative fields and viscosity
        *pTk_d, *pTi_d, *pTs_d, *pAmu_d,
        // velocity, then the projection-direction metrics (As* here)
        *pu_d, *pv_d, *pw_d, *pAsx_d, *pAsy_d, *pAsz_d,
        // Ajac field followed by the full set of metrics
        *pAjac_d,
        *pAkx_d, *pAky_d, *pAkz_d,
        *pAix_d, *pAiy_d, *pAiz_d,
        *pAsx_d, *pAsy_d, *pAsz_d};

    CUDA_LAUNCH(( vis_flux_ker<<<griddim , blockdim, 0, *stream>>>(vis_flux_parameter, *pEv1_d, *pEv2_d, *pEv3_d, *pEv4_d, job) ));
}
// Differentiate the viscous fluxes *pEv1_d..*pEv4_d along z into
// vis_u/v/w/T, then launch YF_Pe_XF once per component to combine each
// result (together with *pAjac_d) into components 1..4 of *pdu_d.
// Component 0 of *pdu_d is not touched here.
void du_viscous_Jacobian3d_z_final(cudaJobPackage job_in, cudaStream_t *stream){
    dim3 size;
    jobsize(&job_in, &size);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    OCFD_dz0(*pEv1_d, *vis_u_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pEv2_d, *vis_v_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pEv3_d, *vis_w_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pEv4_d, *vis_T_d, job_in, BlockDim, stream, D0_bound[4], D0_bound[5]);

    // Same region as job_in, shifted down by LAP.
    dim3 lower(job_in.start.x-LAP, job_in.start.y-LAP, job_in.start.z-LAP);
    dim3 upper(job_in.end.x - LAP, job_in.end.y - LAP, job_in.end.z - LAP);
    cudaJobPackage job(lower, upper);

    // Element count of one component of *pdu_d.
    int size_du = pdu_d->pitch*ny*nz;

    // View onto one component of *pdu_d; starts at component 1.
    cudaField tmp_du;
    tmp_du.pitch = pdu_d->pitch;
    tmp_du.ptr = pdu_d->ptr + size_du;
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_u_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 2
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_v_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 3
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_w_d, *pAjac_d, job) ));

    tmp_du.ptr += size_du;   // component 4
    CUDA_LAUNCH(( YF_Pe_XF<<<griddim , blockdim, 0, *stream>>>(tmp_du, *vis_T_d, *pAjac_d, job) ));
}
// Kernel: fill the low-y halo cells of Ev1..Ev4 by mirroring about y = LAP.
// One thread per cell of `job` (LAP-offset indices). Ev2 is copied with its
// sign kept; Ev1, Ev3 and Ev4 are copied with the sign flipped.
__global__ void boundary_symmetry_pole_vis_y_ker_m(
    cudaField Ev1,
    cudaField Ev2,
    cudaField Ev3,
    cudaField Ev4,
    cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if( x>=job.end.x || y>=job.end.y || z>=job.end.z ) return;

    // Mirror image of y with respect to the plane y = LAP.
    unsigned int y1 = 2*LAP - y;

    get_Field_LAP(Ev1 , x,y,z) = - get_Field_LAP(Ev1 , x,y1,z);
    get_Field_LAP(Ev2 , x,y,z) =   get_Field_LAP(Ev2 , x,y1,z);
    get_Field_LAP(Ev3 , x,y,z) = - get_Field_LAP(Ev3 , x,y1,z);
    get_Field_LAP(Ev4 , x,y,z) = - get_Field_LAP(Ev4 , x,y1,z);
}
// Kernel: fill the high-y halo cells of Ev1..Ev4 by mirroring about
// y = ny_d + LAP - 1. One thread per cell of `job` (LAP-offset indices).
// Ev2 is copied with its sign kept; Ev1, Ev3 and Ev4 are copied with the
// sign flipped.
__global__ void boundary_symmetry_pole_vis_y_ker_p(
    cudaField Ev1,
    cudaField Ev2,
    cudaField Ev3,
    cudaField Ev4,
    cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if( x>=job.end.x || y>=job.end.y || z>=job.end.z ) return;

    // Mirror image of y with respect to the plane y = ny_d + LAP - 1.
    unsigned int y1 = 2*(ny_d+LAP-1) - y;

    get_Field_LAP(Ev1 , x,y,z) = - get_Field_LAP(Ev1 , x,y1,z);
    get_Field_LAP(Ev2 , x,y,z) =   get_Field_LAP(Ev2 , x,y1,z);
    get_Field_LAP(Ev3 , x,y,z) = - get_Field_LAP(Ev3 , x,y1,z);
    get_Field_LAP(Ev4 , x,y,z) = - get_Field_LAP(Ev4 , x,y1,z);
}
// Apply the symmetry / pole boundary condition to the viscous fluxes
// *pEv1_d..*pEv4_d in the y direction. Does nothing unless IF_SYMMETRY == 1.
// The low-y halo is filled on the rank with npy == 0, the high-y halo on the
// rank with npy == NPY0-1 (both when there is a single rank along y).
void boundary_symmetry_pole_vis_y(cudaStream_t *stream){
    if(IF_SYMMETRY != 1) return;

    dim3 blockdim , griddim;

    if(npy == 0){
        // Low side: halo rows y in [0, LAP).
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , LAP , BlockDimZ , nx , LAP , nz);
        cudaJobPackage job(dim3(LAP , 0 , LAP) , dim3(nx_lap , LAP , nz_lap));
        CUDA_LAUNCH(( boundary_symmetry_pole_vis_y_ker_m<<<griddim , blockdim, 0, *stream>>>(*pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d , job) ));
    }

    if(npy == NPY0-1){
        // High side: halo rows y in [ny_lap, ny_2lap).
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , LAP , BlockDimZ , nx , LAP , nz);
        cudaJobPackage job(dim3(LAP , ny_lap , LAP) , dim3(nx_lap , ny_2lap , nz_lap));
        CUDA_LAUNCH(( boundary_symmetry_pole_vis_y_ker_p<<<griddim , blockdim, 0, *stream>>>(*pEv1_d,*pEv2_d,*pEv3_d,*pEv4_d , job) ));
    }
}
#ifdef __cplusplus
}
#endif
#ifndef __NS_SOLVER_C
#define __NS_SOLVER_C
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <netdb.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <math.h>
#include "mpi.h"
#include "parameters.h"
#include "utility.h"
#include "OCFD_NS_Solver.h"
#include "OCFD_time.h"
#include "OCFD_mpi.h"
#include "OCFD_boundary.h"
#include "OCFD_IO.h"
#include "OCFD_init.h"
#include "OCFD_Stream.h"
#include "OCFD_filtering.h"
#include "OCFD_ana.h"
#include "OCFD_mpi_dev.h"
#include "parameters_d.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#include "commen_kernel.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Main time-marching loop of the solver.
 *
 * Sequence:
 *   1. exchange halo data of d, u, v, w, T, apply boundary conditions and
 *      compute the viscosity;
 *   2. repeat until tt >= end_time: swap the f / fn buffers, run three
 *      Runge-Kutta stages, filter, run the configured analyses, print
 *      progress every Kstep_show steps and save every Kstep_save steps;
 *   3. print the total wall time on rank 0.
 *
 * When TEST == 1 the process prints its host name / IP and the time of the
 * first Kstep_show steps, then calls exit(0).
 */
void NS_solver_real()
{
    // initial of Amu, Amu_t ---
    //    Amu=0.d0;
    // ----------------initial---------------------------------------------------------
    exchange_boundary_xyz_packed_dev(pd , pd_d);
    exchange_boundary_xyz_packed_dev(pu , pu_d);
    exchange_boundary_xyz_packed_dev(pv , pv_d);
    exchange_boundary_xyz_packed_dev(pw , pw_d);
    exchange_boundary_xyz_packed_dev(pT , pT_d);
    OCFD_bc();
    get_Amu();
    if (my_id == 0)
        printf("init ok\n\n");

    // wstart0: start of the whole run; wstart: start of the current report window.
    REAL wstart0 , wstart , wend;
    wstart0 = MPI_Wtime();
    wstart = wstart0;
    // -----------------------------------------------------------------------
    do
    {
        {
            // Swap the f / fn buffers on the host and on the device.
            REAL * tmp = pf;
            pf = pfn;
            pfn = tmp;
            tmp = pf_d->ptr;
            pf_d->ptr = pfn_d->ptr;
            pfn_d->ptr = tmp;
        }
        // 3-step Runge-Kutta
        for (int KRK = 1; KRK <= 3; KRK++)
        {
            du_comput(KRK);
            OCFD_time_advance(KRK);
            get_duvwT();
            OCFD_bc();
            get_Amu();
        }
        Istep++;
        tt += dt;
        // ---Filtering -------------------------------
        filtering(pf, pf_lap, pP);
        //modify_NT();
        for(int i = 0; i < N_ana; i++){
            if(Istep % Kstep_ana[i] == 0) OCFD_ana(K_ana[i], i);
        }
        if(Istep % Kstep_show == 0){
            MPI_Barrier(MPI_COMM_WORLD);
            wend = MPI_Wtime();
            if(TEST == 1){
                char hostbuffer[100];
                const char *IPbuffer = "unknown";
                struct hostent *host_entry = NULL;

                // gethostname() may fail, and on truncation the result is not
                // guaranteed to be NUL-terminated.
                if(gethostname(hostbuffer, sizeof(hostbuffer)) != 0){
                    strcpy(hostbuffer, "unknown");
                }else{
                    hostbuffer[sizeof(hostbuffer) - 1] = '\0';
                    host_entry = gethostbyname(hostbuffer);
                }

                // gethostbyname() returns NULL when the name cannot be
                // resolved; only dereference a valid entry with an address.
                if(host_entry != NULL && host_entry->h_addr_list != NULL
                   && host_entry->h_addr_list[0] != NULL){
                    IPbuffer = inet_ntoa(*((struct in_addr*)
                                            host_entry->h_addr_list[0]));
                }

                printf("Host name: %s; Host IP: %s; GPU time %lf\n" , hostbuffer, IPbuffer, wend - wstart);
                exit(0);
            }

            // Residual of the fifth component (index 4) of f.
            REAL E0 = 0.;
            cudaField E0_d;
            E0_d.pitch = pf_d->pitch; E0_d.ptr = pf_d->ptr + 4 * pf_d->pitch*ny*nz;
            ana_residual(E0_d, &E0);
            if(isnan(E0)){
                if(IFLAG_HybridAuto == 1) {
                    //HybridAuto_scheme_IO();
                    //MPI_Barrier(MPI_COMM_WORLD);
                }
                ana_NAN_and_NT();
            }

            // Residual of T; the first component of du is reused as scratch space.
            REAL T0 = 0.;
            cudaField T0_d;
            T0_d.pitch = pdu_d->pitch; T0_d.ptr = pdu_d->ptr;
            get_inner(T0_d, *pT_d);
            ana_residual(T0_d, &T0);

            if(my_id == 0){
                printf("%lf of %lf ( \033[33m%d\033[0m of %d ) , using \033[36m%lf\033[0m\n", tt , end_time , Istep , end_step , wend - wstart0);
                printf("%d steps GPU time %lf\n" ,Kstep_show , wend - wstart);
                printf("Averaged Total Energy is %lf\n", E0);
                printf("Averaged Total T is %lf\n", T0);
                printf("\n");
            }
            wstart = MPI_Wtime();
        }
        // -----------save data---------------------------------------------
        if(Istep%Kstep_save == 0){
            // Copy the primitive fields back to the host before writing.
            memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
            memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
            memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
            memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
            memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
            OCFD_save(0, Istep , pd , pu , pv , pw , pT);
        }
        if(end_time <= 0.0) break; //end_time .le. 0 means that stop computation just after saving files
    } while (tt < end_time);
    // --------------------------------------------------------------------------------------
    MPI_Barrier(MPI_COMM_WORLD);
    if (my_id == 0)
    {
        wend = MPI_Wtime();
        printf("OK! opencfd is finished\n");
        printf("Total GPU time %lf\n" , wend - wstart0);
    }
}
#ifdef __cplusplus
}
#endif
#endif
#include <math.h>
#include "parameters.h"
#include "utility.h"
#include "OCFD_Schemes.h"
#include "OCFD_bound_Scheme.h"
#include "parameters_d.h"
#include "OCFD_warp_shuffle.h"
#include "cuda_utility.h"
#ifdef __cplusplus
extern "C"{
#endif
// Load a 1-D stencil of field `pf` for the calling thread.
//
//   flagxyz   1 or 4: stencil along x; 2 or 5: along y; 3 or 6: along z
//   coords    out: this thread's cell coordinates relative to job.start
//             (written for any valid flagxyz, even if the cell is out of range)
//   stencil   out: stencil[i-ka1] = pf at offset i along the direction,
//             for i = ka1..kb1
//   job       region being processed; job.start is folded into `offset`
//
// For the z direction the thread's y/z block indices are swapped: the cell's
// y comes from the z launch dimension and its z from the y launch dimension.
//
// Returns 1, 2 or 3 (the direction) when the cell lies inside the job and the
// stencil was loaded; 0 otherwise (out of range or unknown flagxyz).
__device__ int get_data0_kernel(int flagxyz, dim3 *coords, cudaField pf, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
    int offset = job.start.x + pf.pitch*(job.start.y + ny_2lap_d*job.start.z);

    int dir;
    if(flagxyz == 1 || flagxyz == 4)      dir = 1;
    else if(flagxyz == 2 || flagxyz == 5) dir = 2;
    else if(flagxyz == 3 || flagxyz == 6) dir = 3;
    else return 0;

    unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int y;
    unsigned int z;
    if(dir == 3){
        y = coords->y = blockDim.z * blockIdx.z + threadIdx.z;
        z = coords->z = blockDim.y * blockIdx.y + threadIdx.y;
    }else{
        y = coords->y = blockDim.y * blockIdx.y + threadIdx.y;
        z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
    }

    // Threads outside the job extent load nothing.
    if(!(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z))){
        return 0;
    }

    if(dir == 1){
        for(int i = ka1; i <= kb1; i++){
            stencil[i-ka1] = get_Field_LAP(pf, x+i, y, z, offset);
        }
    }else if(dir == 2){
        for(int i = ka1; i <= kb1; i++){
            stencil[i-ka1] = get_Field_LAP(pf, x, y+i, z, offset);
        }
    }else{
        for(int i = ka1; i <= kb1; i++){
            stencil[i-ka1] = get_Field_LAP(pf, x, y, z+i, offset);
        }
    }
    return dir;
}
// Store a finite-difference result into `pfy` at the thread's cell.
//
//   flagxyz.x  1 or 4: divide by hx_d; 2 or 5: by hy_d; 3 or 6: by hz_d;
//              any other value stores nothing
//   coords     cell coordinates relative to job.start
//   tmp        undivided difference value
//   pfy        output field, addressed without the LAP offset
__device__ void put_d0_kernel(dim3 flagxyz, dim3 coords, REAL tmp, cudaField pfy, cudaJobPackage job){
    unsigned int x = coords.x + job.start.x;
    unsigned int y = coords.y + job.start.y;
    unsigned int z = coords.z + job.start.z;

    const unsigned int dir = flagxyz.x;
    if(dir == 1 || dir == 4){
        get_Field(pfy, x-LAP, y-LAP, z-LAP) = tmp/hx_d;
    }else if(dir == 2 || dir == 5){
        get_Field(pfy, x-LAP, y-LAP, z-LAP) = tmp/hy_d;
    }else if(dir == 3 || dir == 6){
        get_Field(pfy, x-LAP, y-LAP, z-LAP) = tmp/hz_d;
    }
}
// 6th-order central difference on a 7-point stencil centred at stencil[3]:
//   (45*(f[+1]-f[-1]) - 9*(f[+2]-f[-2]) + (f[+3]-f[-3])) / 60
// The result is not divided by the grid spacing.
__device__ REAL OCFD_kernel_CD6(REAL *stencil){
    const REAL d1 = stencil[4] - stencil[2];
    const REAL d2 = stencil[5] - stencil[1];
    const REAL d3 = stencil[6] - stencil[0];
    return (45.0*d1 - 9.0*d2 + d3)/60.0;
}
// Kernel: 6th-order central derivative of `pf` into `pfy` over `job`.
// flagxyzb.x selects the direction. OCFD_D0bound_scheme_kernel is tried
// first; when it returns non-zero the interior CD6 formula is used instead
// of whatever it may have written to the result.
__global__ void OCFD_CD6_kernel(dim3 flagxyzb, cudaField pf, cudaField pfy, cudaJobPackage job){
    const int ia1 = -3;
    const int ib1 = 3;

    dim3 coords;
    REAL stencil[7];
    REAL tmp;

    // Threads outside the job (or with an unknown direction) do nothing.
    if(get_data0_kernel(flagxyzb.x, &coords, pf, &stencil[0], ia1, ib1, job) == 0) return;

    if(OCFD_D0bound_scheme_kernel(&tmp, flagxyzb, coords, &stencil[0], ia1, job) != 0){
        tmp = OCFD_kernel_CD6(&stencil[0]);
    }
    put_d0_kernel(flagxyzb, coords, tmp, pfy, job);
}
// 8th-order central difference on a 9-point stencil centred at stencil[4]:
//   (672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3]) - 3*(f[+4]-f[-4])) / 840
// The result is not divided by the grid spacing.
__device__ REAL OCFD_kernel_CD8(REAL *stencil){
    const REAL d1 = stencil[5] - stencil[3];
    const REAL d2 = stencil[6] - stencil[2];
    const REAL d3 = stencil[7] - stencil[1];
    const REAL d4 = stencil[8] - stencil[0];
    return (672.0*d1 - 168.0*d2 + 32.0*d3 - 3.0*d4)/840.0;
}
// Kernel: 8th-order central derivative of `pf` into `pfy` over `job`.
// flagxyzb.x selects the direction. OCFD_D0bound_scheme_kernel is tried
// first; when it returns non-zero the interior CD8 formula is used instead
// of whatever it may have written to the result.
__global__ void OCFD_CD8_kernel(dim3 flagxyzb, cudaField pf, cudaField pfy, cudaJobPackage job){
    const int ia1 = -4;
    const int ib1 = 4;

    dim3 coords;
    REAL stencil[9];
    REAL tmp;

    // Threads outside the job (or with an unknown direction) do nothing.
    if(get_data0_kernel(flagxyzb.x, &coords, pf, &stencil[0], ia1, ib1, job) == 0) return;

    if(OCFD_D0bound_scheme_kernel(&tmp, flagxyzb, coords, &stencil[0], ia1, job) != 0){
        tmp = OCFD_kernel_CD8(&stencil[0]);
    }
    put_d0_kernel(flagxyzb, coords, tmp, pfy, job);
}
// Load a 1-D stencil of component `num` of the SoA field `f` for the calling
// thread.
//
//   flagxyz   1/4: x direction, 2/5: y direction, 3/6: z direction
//   coords    out: cell coordinates relative to job.start
//   stencil   out: stencil[i-ka1] for i = ka1..kb1
//   sort      scratch buffer used by the y and z cases; values are written by
//             one thread and read by another after __syncthreads(), so it is
//             presumably block-shared memory -- confirm at the call site
//   job       region being processed; job.start is folded into `offset`
//
// Returns 1 (x), 2 (y) or 3 (z) when the thread's final cell lies inside the
// job, otherwise 0.
//
// Consecutive blocks along the stencil direction advance by blockDim-1 cells
// (blockDim.x-1 for x, blockDim.y-1 for y and z), i.e. they overlap by one cell.
//
// NOTE(review): the y/z cases hard-code 128, 16 and 8 in the scratch indices,
// which looks like it assumes blockDim.x == 8 and blockDim.y == 8 with a
// 16-entry row per thread line; cases 2 and 3 also use a literal 3 where
// cases 5 and 6 use LAP. Confirm these match the launch configuration and LAP.
__device__ int get_data_kernel(int flagxyz, dim3 *coords, cudaSoA f, int num, REAL *stencil, int ka1, int kb1, REAL *sort, cudaJobPackage job){
// Linear offset of job.start inside the padded field.
int offset = job.start.x + f.pitch*(job.start.y + ny_2lap_d*job.start.z);
switch(flagxyz){
// x direction: each thread reads its stencil directly from f.
case 1:
case 4:
{
unsigned int x = coords->x = (blockDim.x-1) * blockIdx.x + threadIdx.x;
unsigned int y = coords->y = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)){
for(int i = ka1; i <= kb1; i++){
stencil[i-ka1] = get_SoA_LAP(f, x+i, y, z, num, offset);
}
return 1;
}
}
break;
// y direction (flag 2): values are staged in `sort` with the roles of
// threadIdx.x and threadIdx.y exchanged between the write and the read.
case 2:
{
unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.y;
unsigned int z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
// ID1: slot written by this thread; ID2: base slot read by this thread
// (same formula with threadIdx.x and threadIdx.y exchanged).
unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
// First load is unconditional (no bounds check); the second, 8 slots
// further, is guarded by the job extent.
sort[ID1] = get_SoA_LAP(f, x, y-LAP+1, z, num, offset);
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y-1) && z < (job.end.z-job.start.z))
sort[ID1+8] = get_SoA_LAP(f, x, y+LAP+1, z, num, offset);
// Barrier is outside any per-thread condition, so every thread of the block reaches it.
__syncthreads();
for(int i = ka1; i <= kb1; i++){
stencil[i-ka1] = sort[ID2+i+3];
}
// Report the coordinates of the cell whose stencil was just read:
// threadIdx.x and threadIdx.y are exchanged relative to the load phase.
x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.x;
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 2;
}
break;
// y direction (flag 5): same staging as flag 2, but the loaded window
// starts one cell lower (y-LAP / y+LAP) and the read offset is LAP.
case 5:
{
unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.y;
unsigned int z = coords->z = blockDim.z * blockIdx.z + threadIdx.z;
unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
sort[ID1] = get_SoA_LAP(f, x, y-LAP, z, num, offset);
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y-1) && z < (job.end.z-job.start.z))
sort[ID1+8] = get_SoA_LAP(f, x, y+LAP, z, num, offset);
__syncthreads();
for(int i = ka1; i <= kb1; i++){
stencil[i-ka1] = sort[ID2+i+LAP];
}
x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
y = coords->y = (blockDim.y-1) * blockIdx.y + threadIdx.x;
// Note: returns 2 (the y direction), same as flag 2.
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 2;
}
break;
// z direction (flag 3): the cell's y comes from the z launch dimension and
// its z from the y launch dimension; staging mirrors the flag-2 case.
case 3:
{
unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int y = coords->y = blockDim.z * blockIdx.z + threadIdx.z;
unsigned int z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.y;
unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
sort[ID1] = get_SoA_LAP(f, x, y, z-LAP+1, num, offset);
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z-1))
sort[ID1+8] = get_SoA_LAP(f, x, y, z+LAP+1, num, offset);
__syncthreads();
for(int i = ka1; i <= kb1; i++){
stencil[i-ka1] = sort[ID2+i+3];
}
x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.x;
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 3;
}
break;
// z direction (flag 6): same as flag 3 with the window starting one cell
// lower (z-LAP / z+LAP) and the read offset LAP.
case 6:
{
unsigned int x = coords->x = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int y = coords->y = blockDim.z * blockIdx.z + threadIdx.z;
unsigned int z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.y;
unsigned int ID1 = 128*threadIdx.z + 16*threadIdx.x + threadIdx.y;
unsigned int ID2 = 128*threadIdx.z + 16*threadIdx.y + threadIdx.x;
sort[ID1] = get_SoA_LAP(f, x, y, z-LAP, num, offset);
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z-1))
sort[ID1+8] = get_SoA_LAP(f, x, y, z+LAP, num, offset);
__syncthreads();
for(int i = ka1; i <= kb1; i++){
stencil[i-ka1] = sort[ID2+i+LAP];
}
x = coords->x = blockDim.x * blockIdx.x + threadIdx.y;
z = coords->z = (blockDim.y-1) * blockIdx.y + threadIdx.x;
// Note: returns 3 (the z direction), same as flag 3.
if(x < (job.end.x-job.start.x) && y < (job.end.y-job.start.y) && z < (job.end.z-job.start.z)) return 3;
}
break;
}
// Out of range, or flagxyz is not one of 1..6.
return 0;
}
// Accumulate the flux difference of the "plus" pass into component `num`
// of `du` at the thread's own cell.
//
//   flagxyz.x  1/4: x (divide by hx_d), 2/5: y (hy_d), 3/6: z (hz_d)
//   flagxyz.z  when 1 together with flagxyz.x == 1/2/3, the cell whose
//              relative coordinate along that direction equals 1 receives
//              an increment of 0 instead
//   coords     cell coordinates relative to job.start
//   tmp_r, tmp_l   the two flux values whose difference is accumulated
//   Ajac       field multiplied into the increment (read with LAP offset)
//
// The update is du -= Ajac * (tmp_r - tmp_l) / h, performed with atomicAdd.
__device__ void put_du_p_kernel(dim3 flagxyz, dim3 coords, REAL tmp_r, REAL tmp_l, cudaSoA du, int num, cudaField Ajac, cudaJobPackage job){
    unsigned int x = coords.x + job.start.x;
    unsigned int y = coords.y + job.start.y;
    unsigned int z = coords.z + job.start.z;

    const unsigned int dir = flagxyz.x;

    if(dir == 1 || dir == 4){
        const bool zero_inc = (dir == 1 && flagxyz.z == 1 && coords.x == 1);
        if(zero_inc){
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
        }else{
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z)*(tmp_r - tmp_l)/hx_d);
        }
    }else if(dir == 2 || dir == 5){
        const bool zero_inc = (dir == 2 && flagxyz.z == 1 && coords.y == 1);
        if(zero_inc){
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
        }else{
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z)*(tmp_r - tmp_l)/hy_d);
        }
    }else if(dir == 3 || dir == 6){
        const bool zero_inc = (dir == 3 && flagxyz.z == 1 && coords.z == 1);
        if(zero_inc){
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
        }else{
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z)*(tmp_r - tmp_l)/hz_d);
        }
    }
}
// Accumulate the flux difference of the "minus" pass into component `num`
// of `du` at the cell one step below the thread's cell along the direction.
//
//   flagxyz.x  1/4: x (divide by hx_d), 2/5: y (hy_d), 3/6: z (hz_d)
//   flagxyz.z  when 1 together with flagxyz.x == 4/5/6, the last cell of the
//              job along that direction receives an increment of 0 instead
//   coords     cell coordinates relative to job.start
//   tmp_r, tmp_l   the two flux values whose difference is accumulated
//   Ajac       field multiplied into the increment, read at the target cell
//
// The update is du -= Ajac * (tmp_r - tmp_l) / h, performed with atomicAdd.
__device__ void put_du_m_kernel(dim3 flagxyz, dim3 coords, REAL tmp_r, REAL tmp_l, cudaSoA du, int num, cudaField Ajac, cudaJobPackage job){
    unsigned int x = coords.x + job.start.x;
    unsigned int y = coords.y + job.start.y;
    unsigned int z = coords.z + job.start.z;

    const unsigned int dir = flagxyz.x;

    if(dir == 1 || dir == 4){
        // Target cell: (x-1, y, z).
        const bool zero_inc = (dir == 4 && flagxyz.z == 1 && coords.x == job.end.x-job.start.x-1);
        if(zero_inc){
            atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), 0);
        }else{
            atomicAdd(du.ptr + (x - LAP - 1 + du.pitch*(y - LAP + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x-1, y, z)*(tmp_r - tmp_l)/hx_d);
        }
    }else if(dir == 2 || dir == 5){
        // Target cell: (x, y-1, z).
        const bool zero_inc = (dir == 5 && flagxyz.z == 1 && coords.y == job.end.y-job.start.y-1);
        if(zero_inc){
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), 0);
        }else{
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP - 1 + ny_d*(z - LAP + (num)*nz_d))), -get_Field_LAP(Ajac, x, y-1, z)*(tmp_r - tmp_l)/hy_d);
        }
    }else if(dir == 3 || dir == 6){
        // Target cell: (x, y, z-1).
        const bool zero_inc = (dir == 6 && flagxyz.z == 1 && coords.z == job.end.z-job.start.z-1);
        if(zero_inc){
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), 0);
        }else{
            atomicAdd(du.ptr + (x - LAP + du.pitch*(y - LAP + ny_d*(z - LAP - 1 + (num)*nz_d))), -get_Field_LAP(Ajac, x, y, z-1)*(tmp_r - tmp_l)/hz_d);
        }
    }
}
// =================================================================================================================================== //
//---------------------------------------------------------WENO7_SYMBO_P-------------------------------------------------------------//
// 7th order WENO-SYMBO scheme (Bandwidth-Optimized Symmetric WENO scheme), see Martin et al., J. Comput. Phys. 220, 270-289.
// ----- WENO-SYMOO and WENO-SYMBO differ only in their coefficients; everything else is the same.
// Device helper: "plus"-side WENO-SYMBO reconstruction from an 8-point stencil.
//   WENO_LMT_FLAG : when equal to 1, a total-variation test decides whether the fixed
//                   (linear) weights may replace the nonlinear WENO weights.
//   stencil       : 8 consecutive values; stencil[0] .. stencil[7] are all read.
// Returns the weighted average of five 4-point candidate reconstructions built on
// stencil[0..3], [1..4], [2..5], [3..6] and [4..7].
__device__ REAL OCFD_weno7_SYMBO_kernel_P(int WENO_LMT_FLAG, REAL *stencil){
REAL S0, S1, S2, S3, S4;
// The defaults TVmin = 0 / TVmax = 1 make the limiter test below fail, so the nonlinear
// branch is always taken when WENO_LMT_FLAG != 1.
REAL tmp, tmp1, TVmin = 0, TVmax = 1;
if(WENO_LMT_FLAG == 1){
// Total variation (sum of absolute neighbour differences) of each 4-point sub-stencil.
S0 = fabs(stencil[1] - stencil[0]) + fabs(stencil[2] - stencil[1]) + fabs(stencil[3] - stencil[2]);
S1 = fabs(stencil[2] - stencil[1]) + fabs(stencil[3] - stencil[2]) + fabs(stencil[4] - stencil[3]);
S2 = fabs(stencil[3] - stencil[2]) + fabs(stencil[4] - stencil[3]) + fabs(stencil[5] - stencil[4]);
S3 = fabs(stencil[4] - stencil[3]) + fabs(stencil[5] - stencil[4]) + fabs(stencil[6] - stencil[5]);
S4 = fabs(stencil[5] - stencil[4]) + fabs(stencil[6] - stencil[5]) + fabs(stencil[7] - stencil[6]);
// TVmin = smallest of the five variations.
tmp = fmin(S0,S1);
tmp1 = fmin(S2,S3);
tmp = fmin(tmp,tmp1);
TVmin = fmin(tmp ,S4);
// TVmax = largest of the five variations.
tmp = fmax(S0,S1);
tmp1 = fmax(S2,S3);
tmp = fmax(tmp,tmp1);
TVmax = fmax(tmp ,S4);
}
if(TVmax < WENO_TV_Limiter_d*TVmin && TVmax < WENO_TV_MAX_d){
// Variations are similar and small: use the fixed weights directly.
S0 = 0.0401954833730;
S1 = 0.2493800006710;
S2 = 0.4802686256260;
S3 = 0.2009775476730;
S4 = 0.0291783426580;
}else{
// Nonlinear weights. Each S_k accumulates 720*a^2 + 780*b^2 + 781*c^2 where a, b, c are the
// three difference expressions ("1st", "2nd", "3rd" groups below) on sub-stencil k.
S0 = 0.0; S1 = 0.0; S2 = 0.0; S3 = 0.0; S4 =0.0;
// 1st
tmp = -2.0*stencil[0] + 9.0*stencil[1] - 18.0*stencil[2] + 11.0*stencil[3]; S0 += 720.0*tmp*tmp;
tmp = stencil[1] - 6.0*stencil[2] + 3.0*stencil[3] + 2.0*stencil[4]; S1 += 720.0*tmp*tmp;
tmp = -2.0*stencil[2] - 3.0*stencil[3] + 6.0*stencil[4] - stencil[5]; S2 += 720.0*tmp*tmp;
tmp = -11.0*stencil[3] + 18.0*stencil[4] - 9.0*stencil[5] + 2.0*stencil[6]; S3 += 720.0*tmp*tmp;
tmp = -26.0*stencil[4] + 57.0*stencil[5] - 42.0*stencil[6] + 11.0*stencil[7]; S4 += 720.0*tmp*tmp;
// 2nd
tmp = -6.0*stencil[0] + 24.0*stencil[1] - 30.0*stencil[2] + 12.0*stencil[3]; S0 += 780.0*tmp*tmp;
tmp = 6.0*stencil[2] - 12.0*stencil[3] + 6.0*stencil[4]; S1 += 780.0*tmp*tmp;
tmp = 6.0*stencil[3] - 12.0*stencil[4] + 6.0*stencil[5]; S2 += 780.0*tmp*tmp;
tmp = 12.0*stencil[3] - 30.0*stencil[4] + 24.0*stencil[5] - 6.0*stencil[6]; S3 += 780.0*tmp*tmp;
tmp = 18.0*stencil[4] - 48.0*stencil[5] + 42.0*stencil[6] - 12.0*stencil[7]; S4 += 780.0*tmp*tmp;
// 3rd
tmp = -6.0*stencil[0] + 18.0*( stencil[1] - stencil[2] ) + 6.0*stencil[3]; S0 += 781.0*tmp*tmp;
tmp = -6.0*stencil[1] + 18.0*( stencil[2] - stencil[3] ) + 6.0*stencil[4]; S1 += 781.0*tmp*tmp;
tmp = -6.0*stencil[2] + 18.0*( stencil[3] - stencil[4] ) + 6.0*stencil[5]; S2 += 781.0*tmp*tmp;
tmp = -6.0*stencil[3] + 18.0*( stencil[4] - stencil[5] ) + 6.0*stencil[6]; S3 += 781.0*tmp*tmp;
tmp = -6.0*stencil[4] + 18.0*( stencil[5] - stencil[6] ) + 6.0*stencil[7]; S4 += 781.0*tmp*tmp;
{
// The indicator of the last sub-stencil is replaced by the maximum of all five.
tmp = fmax(S0,S1);
tmp1 = fmax(S2,S3);
tmp = fmax(tmp,tmp1);
S4 = fmax(tmp ,S4);
}
{
// Un-normalised weights: fixed coefficient times the product of (2.592e-6 + S_j) over the
// four *other* indicators, i.e. proportional to coefficient/(2.592e-6 + S_k) without a
// division. S4 is written first because it only needs S0..S3, which are still intact;
// S3..S0 are then overwritten from the temporaries.
REAL tmp2, tmp3;
tmp = (0.0401954833730)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
tmp1 = (0.2493800006710)*(2.592e-6+S0)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
tmp2 = (0.4802686256260)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S3)*(2.592e-6+S4);
tmp3 = (0.2009775476730)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S4);
S4 = (0.0291783426580)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3);
S3 = tmp3;
S2 = tmp2;
S1 = tmp1;
S0 = tmp;
}
}
// From here on S0..S4 hold the (un-normalised) weights; am is their sum.
REAL am=S0+S1+S2+S3+S4;
tmp1 = 0.0;
// Each line is 12x a 4-point candidate reconstruction; the factor 12 and the weight
// normalisation are removed together by the final division.
tmp = -3.0*stencil[0] + 13.0*stencil[1] - 23.0*stencil[2] + 25.0*stencil[3]; tmp1 += S0*tmp;
tmp = stencil[1] - 5.0*stencil[2] + 13.0*stencil[3] + 3.0*stencil[4]; tmp1 += S1*tmp;
tmp = -stencil[2] + 7.0*stencil[3] + 7.0*stencil[4] - stencil[5]; tmp1 += S2*tmp;
tmp = 3.0*stencil[3] + 13.0*stencil[4] - 5.0*stencil[5] + stencil[6]; tmp1 += S3*tmp;
tmp = 25.0*stencil[4] - 23.0*stencil[5] + 13.0*stencil[6] - 3.0*stencil[7]; tmp1 += S4*tmp;
tmp1 /= (12.0*am);
return tmp1;
}
// Kernel: WENO-SYMBO "plus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i             : variable index forwarded to get_data_kernel / put_du_p_kernel.
//   WENO_LMT_FLAG : forwarded to OCFD_weno7_SYMBO_kernel_P (1 enables its TV limiter test).
//   flagxyzb      : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac   : input SoA, output SoA and the field forwarded to put_du_p_kernel.
//   job           : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel; the
// required size is chosen by the launcher and is not visible here.
__global__ void OCFD_weno7_SYMBO_P_kernel(int i, int WENO_LMT_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 8-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -3;
    int ib1 = 4;
    REAL stencil[8];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_weno7_SYMBO_kernel_P(WENO_LMT_FLAG, stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//=====================================================================================================================================================//
//----------------------------------------------------------------WENO7_SYMBO_M------------------------------------------------------------------------//
// Device helper: "minus"-side WENO-SYMBO reconstruction from an 8-point stencil. It is the
// mirror image of OCFD_weno7_SYMBO_kernel_P (stencil index k <-> 7-k).
//   WENO_LMT_FLAG : when equal to 1, a total-variation test decides whether the fixed
//                   (linear) weights may replace the nonlinear WENO weights.
//   stencil       : 8 consecutive values; stencil[0] .. stencil[7] are all read.
// Returns the weighted average of five 4-point candidate reconstructions built on
// stencil[7..4], [6..3], [5..2], [4..1] and [3..0].
//
// Fix: the limiter block used to leave TVmin at its default 0 and computed TVmax from a mix of
// fmin/fmax, so the test "TVmax < WENO_TV_Limiter_d*TVmin" could never succeed and the limiter
// was silently disabled on the minus side. TVmin/TVmax are now the true min/max of the five
// variations, exactly as in the P variant.
__device__ REAL OCFD_weno7_SYMBO_kernel_M(int WENO_LMT_FLAG, REAL *stencil){
    REAL S0, S1, S2, S3, S4;
    // Defaults make the limiter test fail when WENO_LMT_FLAG != 1.
    REAL tmp, tmp1, TVmin = 0, TVmax = 1;

    if(WENO_LMT_FLAG == 1){
        // Total variation of each 4-point sub-stencil.
        S0 = fabs(stencil[7] - stencil[6]) + fabs(stencil[6] - stencil[5]) + fabs(stencil[5] - stencil[4]);
        S1 = fabs(stencil[6] - stencil[5]) + fabs(stencil[5] - stencil[4]) + fabs(stencil[4] - stencil[3]);
        S2 = fabs(stencil[5] - stencil[4]) + fabs(stencil[4] - stencil[3]) + fabs(stencil[3] - stencil[2]);
        S3 = fabs(stencil[4] - stencil[3]) + fabs(stencil[3] - stencil[2]) + fabs(stencil[2] - stencil[1]);
        S4 = fabs(stencil[3] - stencil[2]) + fabs(stencil[2] - stencil[1]) + fabs(stencil[1] - stencil[0]);

        // Smallest variation.
        tmp = fmin(S0,S1);
        tmp1 = fmin(S2,S3);
        tmp = fmin(tmp,tmp1);
        TVmin = fmin(tmp ,S4);

        // Largest variation.
        tmp = fmax(S0,S1);
        tmp1 = fmax(S2,S3);
        tmp = fmax(tmp,tmp1);
        TVmax = fmax(tmp ,S4);
    }

    if(TVmax < WENO_TV_Limiter_d*TVmin && TVmax < WENO_TV_MAX_d){
        // Variations are similar and small: use the fixed weights directly.
        S0 = 0.0401954833730;
        S1 = 0.2493800006710;
        S2 = 0.4802686256260;
        S3 = 0.2009775476730;
        S4 = 0.0291783426580;
    }else{
        // Nonlinear weights: S_k = 720*a^2 + 780*b^2 + 781*c^2 on sub-stencil k.
        S0 = 0.0; S1 = 0.0; S2 = 0.0; S3 = 0.0; S4 =0.0;
        // 1st
        tmp = -2.0*stencil[7] + 9.0*stencil[6] - 18.0*stencil[5] + 11.0*stencil[4]; S0 += 720.0*tmp*tmp;
        tmp = stencil[6] - 6.0*stencil[5] + 3.0*stencil[4] + 2.0*stencil[3]; S1 += 720.0*tmp*tmp;
        tmp = -2.0*stencil[5] - 3.0*stencil[4] + 6.0*stencil[3] - stencil[2]; S2 += 720.0*tmp*tmp;
        tmp = -11.0*stencil[4] + 18.0*stencil[3] - 9.0*stencil[2] + 2.0*stencil[1]; S3 += 720.0*tmp*tmp;
        tmp = -26.0*stencil[3] + 57.0*stencil[2] - 42.0*stencil[1] + 11.0*stencil[0]; S4 += 720.0*tmp*tmp;
        // 2nd
        tmp = -6.0*stencil[7] + 24.0*stencil[6] - 30.0*stencil[5] + 12.0*stencil[4]; S0 += 780.0*tmp*tmp;
        tmp = 6.0*stencil[5] - 12.0*stencil[4] + 6.0*stencil[3]; S1 += 780.0*tmp*tmp;
        tmp = 6.0*stencil[4] - 12.0*stencil[3] + 6.0*stencil[2]; S2 += 780.0*tmp*tmp;
        tmp = 12.0*stencil[4] - 30.0*stencil[3] + 24.0*stencil[2] - 6.0*stencil[1]; S3 += 780.0*tmp*tmp;
        tmp = 18.0*stencil[3] - 48.0*stencil[2] + 42.0*stencil[1] - 12.0*stencil[0]; S4 += 780.0*tmp*tmp;
        // 3rd
        tmp = -6.0*stencil[7] + 18.0*( stencil[6] - stencil[5] ) + 6.0*stencil[4]; S0 += 781.0*tmp*tmp;
        tmp = -6.0*stencil[6] + 18.0*( stencil[5] - stencil[4] ) + 6.0*stencil[3]; S1 += 781.0*tmp*tmp;
        tmp = -6.0*stencil[5] + 18.0*( stencil[4] - stencil[3] ) + 6.0*stencil[2]; S2 += 781.0*tmp*tmp;
        tmp = -6.0*stencil[4] + 18.0*( stencil[3] - stencil[2] ) + 6.0*stencil[1]; S3 += 781.0*tmp*tmp;
        tmp = -6.0*stencil[3] + 18.0*( stencil[2] - stencil[1] ) + 6.0*stencil[0]; S4 += 781.0*tmp*tmp;
        {
            // The indicator of the last sub-stencil is replaced by the maximum of all five.
            tmp = fmax(S0,S1);
            tmp1 = fmax(S2,S3);
            tmp = fmax(tmp,tmp1);
            S4 = fmax(tmp ,S4);
        }
        {
            // Un-normalised weights: coefficient times the product of (2.592e-6 + S_j) over
            // the other four indicators. S4 is written first (it needs only S0..S3).
            REAL tmp2, tmp3;
            tmp = (0.0401954833730)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
            tmp1 = (0.2493800006710)*(2.592e-6+S0)*(2.592e-6+S2)*(2.592e-6+S3)*(2.592e-6+S4);
            tmp2 = (0.4802686256260)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S3)*(2.592e-6+S4);
            tmp3 = (0.2009775476730)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S4);
            S4 = (0.0291783426580)*(2.592e-6+S0)*(2.592e-6+S1)*(2.592e-6+S2)*(2.592e-6+S3);
            S3 = tmp3;
            S2 = tmp2;
            S1 = tmp1;
            S0 = tmp;
        }
    }

    // S0..S4 now hold the (un-normalised) weights; am is their sum.
    REAL am=S0+S1+S2+S3+S4;
    tmp1 = 0.0;
    // Each line is 12x a 4-point candidate reconstruction.
    tmp = -3.0*stencil[7] + 13.0*stencil[6] - 23.0*stencil[5] + 25.0*stencil[4]; tmp1 += S0*tmp;
    tmp = stencil[6] - 5.0*stencil[5] + 13.0*stencil[4] + 3.0*stencil[3]; tmp1 += S1*tmp;
    tmp = - stencil[5] + 7.0*stencil[4] + 7.0*stencil[3] - stencil[2]; tmp1 += S2*tmp;
    tmp = 3.0*stencil[4] + 13.0*stencil[3] - 5.0*stencil[2] + stencil[1]; tmp1 += S3*tmp;
    tmp = 25.0*stencil[3] - 23.0*stencil[2] + 13.0*stencil[1] - 3.0*stencil[0]; tmp1 += S4*tmp;
    tmp1 /= (12.0*am);
    return tmp1;
}
// Kernel: WENO-SYMBO "minus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i             : variable index forwarded to get_data_kernel / put_du_m_kernel.
//   WENO_LMT_FLAG : forwarded to OCFD_weno7_SYMBO_kernel_M (1 enables its TV limiter test).
//   flagxyzb      : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac   : input SoA, output SoA and the field forwarded to put_du_m_kernel.
//   job           : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_weno7_SYMBO_M_kernel(int i, int WENO_LMT_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // Block-wide barrier executed unconditionally by every thread before any work starts.
    __syncthreads();

    // 8-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -4;
    int ib1 = 3;
    REAL stencil[8];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here = 0.0;
    const int use_interior = OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_weno7_SYMBO_kernel_M(WENO_LMT_FLAG, stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//=======================================================================================================================================================//
//---------------------------------------------------------------------WENO7_P---------------------------------------------------------------------------//
// Device helper: "plus"-side WENO7 reconstruction from a 7-point stencil.
//   stencil : 7 consecutive values; stencil[0] .. stencil[6] are all read.
// Returns the weighted average of four 4-point candidate reconstructions built on
// stencil[0..3], [1..4], [2..5] and [3..6].
__device__ REAL OCFD_weno7_kernel_P(REAL *stencil){
REAL S0 =0.0, S1 =0.0, S2 =0.0, S3 =0.0;
REAL tmp, tmp1, tmp2;
// Smoothness indicator of each sub-stencil: S = 960*a^2 + 1040*b^2 + 1043*c^2 + 80*a*c,
// with a = tmp, b = tmp1, c = tmp2 (tmp is reused to hold the cross product a*c).
tmp = -2.0*stencil[0] + 9.0*stencil[1] - 18.0*stencil[2] + 11.0*stencil[3]; S0 += 960*tmp*tmp;
tmp1 = -6.0*stencil[0] + 24.0*stencil[1] - 30.0*stencil[2] + 12.0*stencil[3]; S0 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[0] + 18.0*( stencil[1] - stencil[2] ) + 6.0*stencil[3]; S0 += (1043.0)*tmp2*tmp2;
tmp = tmp * tmp2; S0 += 80.0*tmp;
tmp = stencil[1] - 6.0*stencil[2] + 3.0*stencil[3] + 2.0*stencil[4]; S1 += 960*tmp*tmp;
tmp1 = 6.0*stencil[2] - 12.0*stencil[3] + 6.0*stencil[4]; S1 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[1] + 18.0*( stencil[2] - stencil[3] ) + 6.0*stencil[4]; S1 += (1043.0)*tmp2*tmp2;
tmp = tmp * tmp2; S1 += 80.0*tmp;
tmp = -2.0*stencil[2] - 3.0*stencil[3] + 6.0*stencil[4] - stencil[5]; S2 += 960*tmp*tmp;
tmp1 = 6.0*stencil[3] - 12.0*stencil[4] + 6.0*stencil[5]; S2 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[2] + 18.0*(stencil[3] - stencil[4]) + 6.0*stencil[5]; S2 += (1043.0)*tmp2*tmp2;
tmp = tmp * tmp2; S2 += 80.0*tmp;
tmp = -11.0*stencil[3] + 18.0*stencil[4] - 9.0*stencil[5] + 2.0*stencil[6]; S3 += 960*tmp*tmp;
tmp1 = 12.0*stencil[3] - 30.0*stencil[4] + 24.0*stencil[5] - 6.0*stencil[6]; S3 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[3] + 18.0*( stencil[4] - stencil[5] ) + 6.0*stencil[6]; S3 += (1043.0)*tmp2*tmp2;
tmp = tmp * tmp2; S3 += 80.0*tmp;
// Un-normalised weights: coefficient (1, 12, 18, 4) times the product of (3.456e-4 + S_j)^2
// over the three *other* indicators, i.e. proportional to coefficient/(3.456e-4 + S_k)^2
// without a division. S3 is written first (it needs only S0..S2, still intact); S2..S0 are
// then overwritten from the temporaries.
tmp = ((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
tmp1 = (12.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
tmp2 = (18.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S3)*(3.456e-4+S3));
S3 = (4.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2));
S2 = tmp2;
S1 = tmp1;
S0 = tmp;
// S0..S3 now hold the weights; am is their sum.
REAL am=S0+S1+S2+S3;
tmp1 = 0.0;
// Each line is 12x a 4-point candidate reconstruction; the factor 12 and the weight
// normalisation are removed together by the final division.
tmp = -3.0*stencil[0] + 13.0*stencil[1] - 23.0*stencil[2] + 25.0*stencil[3]; tmp1 += S0*tmp;
tmp = stencil[1] - 5.0*stencil[2] + 13.0*stencil[3] + 3.0*stencil[4]; tmp1 += S1*tmp;
tmp = -stencil[2] + 7.0*stencil[3] + 7.0*stencil[4] - stencil[5]; tmp1 += S2*tmp;
tmp = 3.0*stencil[3] + 13.0*stencil[4] - 5.0*stencil[5] + stencil[6]; tmp1 += S3*tmp;
tmp1 /= (12.0*am);
return tmp1;
}
// Kernel: WENO7 "plus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_p_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_p_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_weno7_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 7-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -3;
    int ib1 = 3;
    REAL stencil[7];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_weno7_kernel_P(stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//==============================================================================================================================================================//
//------------------------------------------------------------------------WENO7_M-------------------------------------------------------------------------------//
// Device helper: "minus"-side WENO7 reconstruction from a 7-point stencil. The formulas are
// those of OCFD_weno7_kernel_P with the stencil index mirrored (k <-> 6-k).
//   stencil : 7 consecutive values; stencil[0] .. stencil[6] are all read.
// Returns the weighted average of four 4-point candidate reconstructions built on
// stencil[6..3], [5..2], [4..1] and [3..0].
__device__ REAL OCFD_weno7_kernel_M(REAL *stencil){
REAL S0 =0.0, S1 =0.0, S2 =0.0, S3 =0.0;
REAL tmp, tmp1, tmp2;
// Smoothness indicator of each sub-stencil: S = 960*a^2 + 1040*b^2 + 1043*c^2 + 80*a*c,
// with a = tmp, b = tmp1, c = tmp2 (tmp is reused to hold the cross product a*c).
tmp = -2.0*stencil[6] + 9.0*stencil[5] - 18.0*stencil[4] + 11.0*stencil[3]; S0 += 960*tmp*tmp;
tmp1 = -6.0*stencil[6] + 24.0*stencil[5] - 30.0*stencil[4] + 12.0*stencil[3]; S0 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[6] + 18.0*( stencil[5] - stencil[4] ) + 6.0*stencil[3]; S0 += (1043.0)*tmp2*tmp2;
tmp = tmp * tmp2; S0 += (80.0) * tmp;
tmp = stencil[5] - 6.0* stencil[4] + 3.0*stencil[3] + 2.0*stencil[2]; S1 += 960*tmp*tmp;
tmp1 = 6.0*stencil[4] - 12.0* stencil[3] + 6.0*stencil[2]; S1 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[5] + 18.0*( stencil[4] - stencil[3]) + 6.0*stencil[2]; S1 += (1043.0)*tmp2*tmp2;
tmp = tmp *tmp2; S1 += (80.0) * tmp;
tmp = -2.0*stencil[4] - 3.0*stencil[3] + 6.0*stencil[2] - stencil[1]; S2 += 960*tmp*tmp;
tmp1 = 6.0*stencil[3] - 12.0*stencil[2] + 6.0*stencil[1]; S2 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[4] + 18.0*(stencil[3] - stencil[2]) + 6.0*stencil[1]; S2 += (1043.0)*tmp2*tmp2;
tmp = tmp *tmp2; S2 += (80.0) * tmp;
tmp = -11.0*stencil[3] + 18.0*stencil[2] - 9.0*stencil[1] + 2.0*stencil[0]; S3 += 960*tmp*tmp;
tmp1 = 12.0*stencil[3] - 30.0*stencil[2] + 24.0*stencil[1] - 6.0*stencil[0]; S3 += (1040.0)*tmp1*tmp1;
tmp2 = -6.0*stencil[3] + 18.0*( stencil[2] - stencil[1] ) + 6.0*stencil[0]; S3 += (1043.0)*tmp2*tmp2;
tmp = tmp *tmp2; S3 += (80.0) * tmp;
// Un-normalised weights: coefficient (1, 12, 18, 4) times the product of (3.456e-4 + S_j)^2
// over the three *other* indicators. S3 is written first (it needs only S0..S2, still
// intact); S2..S0 are then overwritten from the temporaries.
tmp = ((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
tmp1 = (12.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S2)*(3.456e-4+S2))*((3.456e-4+S3)*(3.456e-4+S3));
tmp2 = (18.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S3)*(3.456e-4+S3));
S3 = (4.0)*((3.456e-4+S0)*(3.456e-4+S0))*((3.456e-4+S1)*(3.456e-4+S1))*((3.456e-4+S2)*(3.456e-4+S2));
S2 = tmp2;
S1 = tmp1;
S0 = tmp;
// S0..S3 now hold the weights; am is their sum.
REAL am=S0+S1+S2+S3;
tmp1 = 0.0;
// Each line is 12x a 4-point candidate reconstruction; the factor 12 and the weight
// normalisation are removed together by the final division.
tmp = -3.0*stencil[6] + 13.0*stencil[5] - 23.0*stencil[4] + 25.0*stencil[3]; tmp1 += S0*tmp;
tmp = stencil[5] - 5.0*stencil[4] + 13.0*stencil[3] + 3.0*stencil[2]; tmp1 += S1*tmp;
tmp = -stencil[4] + 7.0*stencil[3] + 7.0*stencil[2] - stencil[1]; tmp1 += S2*tmp;
tmp = 3.0*stencil[3] + 13.0*stencil[2] - 5.0*stencil[1] + stencil[0]; tmp1 += S3*tmp;
tmp1 /= (12.0*am);
return tmp1;
}
// Kernel: WENO7 "minus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_m_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_m_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
//
// Fix: the boundary treatment used to call OCFD_bound_scheme_kernel_p (the plus-side helper)
// although this kernel reconstructs the minus-side flux and writes through put_du_m_kernel.
// Every other *_M_kernel in this file uses OCFD_bound_scheme_kernel_m; this one now does too.
__global__ void OCFD_weno7_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];
    dim3 coords;
    REAL stencil[7];

    // 7-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -3; int ib1 = 3;

    // A zero return means this thread has nothing to process.
    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);

    if(flag != 0){
        REAL tmp_r, tmp_l;

        // Minus-side boundary helper; a non-zero return asks for the interior scheme.
        flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);
        if(flag != 0) tmp_r = OCFD_weno7_kernel_M(&stencil[0]);

        // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
        // definition of __shfl_up_double.
        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);

        // Thread 0 is excluded from the write-back.
        if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
    }
}
//==============================================================================================================================================================//
// Fortran-style SIGN(x1, x2): returns |x1| when x2 >= 0 and -|x1| otherwise.
__device__ REAL sign(REAL x1, REAL x2){
    const REAL magnitude = fabs(x1);
    return (x2 >= 0) ? magnitude : -magnitude;
}
// Two-argument minmod: the argument of smaller magnitude when x1 and x2 have the same sign
// (as judged by sign()), zero when the signs differ.
__device__ REAL minmod2(REAL x1, REAL x2){
    // +2 / -2 when both signs agree, 0 otherwise.
    const REAL sign_sum = sign(1.0, x1) + sign(1.0, x2);
    const REAL smallest = fmin(fabs(x1),fabs(x2));
    return 0.5*sign_sum*smallest;
}
// Four-argument minmod: when all four arguments share the same sign (as judged by sign()),
// returns that sign times the smallest magnitude; otherwise returns zero.
//
// Fix: the magnitude used to be taken as fmin(x1, x2, x3, x4) on the *signed* values. For
// four negative arguments that picked the most negative one and, multiplied by the sign
// factor -1, produced a positive result with the largest magnitude. The minimum is now taken
// over the absolute values, as minmod2 already does.
__device__ REAL minmod4(REAL x1, REAL x2, REAL x3, REAL x4){
    // Sign factor: +1 / -1 when x2, x3 and x4 all agree in sign with x1, 0 otherwise.
    REAL minmod4 = 0.5*(sign(1.0, x1) + sign(1.0, x2));
    minmod4 = minmod4*fabs(0.5*(sign(1.0, x1) + sign(1.0, x3)));
    minmod4 = minmod4*fabs(0.5*(sign(1.0, x1) + sign(1.0, x4)));

    // Smallest magnitude among the four arguments.
    REAL tmp = fmin(fabs(x1), fabs(x2));
    REAL tmp1 = fmin(fabs(x3), fabs(x4));
    tmp = fmin(tmp,tmp1);

    minmod4 = minmod4*tmp;
    return minmod4;
}
//===================================================================2order_NND========================================================================//
// Device helper: "plus"-side NND2 reconstruction from three values stencil[0..2]:
// the centre value plus half the minmod-limited slope.
__device__ REAL OCFD_NND2_kernel_P(REAL *stencil){
    const REAL slope_fwd = stencil[2] - stencil[1];
    const REAL slope_bwd = stencil[1] - stencil[0];
    return stencil[1] + 0.5*minmod2(slope_fwd, slope_bwd);
}
// Kernel: NND2 "plus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_p_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_p_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_NND2_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 5-point stencil is fetched (offsets ia1 .. ib1); the interior scheme only reads
    // stencil[1..3].
    int ia1 = -2;
    int ib1 = 2;
    REAL stencil[5];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_NND2_kernel_P(stencil + 1);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//----------------------------------------------------------------------------------------------------------------------------
// Device helper: "minus"-side NND2 reconstruction from three values stencil[0..2]:
// the centre value plus half the minmod-limited slope taken in the reversed direction.
__device__ REAL OCFD_NND2_kernel_M(REAL *stencil){
    const REAL slope_bwd = stencil[0] - stencil[1];
    const REAL slope_fwd = stencil[1] - stencil[2];
    return stencil[1] + 0.5*minmod2(slope_bwd, slope_fwd);
}
// Kernel: NND2 "minus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_m_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_m_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_NND2_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 5-point stencil is fetched (offsets ia1 .. ib1); the interior scheme only reads
    // stencil[1..3].
    int ia1 = -2;
    int ib1 = 2;
    REAL stencil[5];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_NND2_kernel_M(stencil + 1);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//==================================================================================================================================
//===================================================================UP7 (7-point upwind)========================================================================//
// Device helper: "plus"-side UP7 reconstruction, a fixed linear combination of stencil[0..6]
// with weights (-3, 25, -101, 319, 214, -38, 4)/420. stencil[7] is not read.
__device__ REAL OCFD_UP7_kernel_P(REAL *stencil){
    // Alternative 8-point formula left disabled by the original author:
    //   (3.0*stencil[0] - 28.0*stencil[1] + 126.0*stencil[2] - 420.0*stencil[3] + 105.0*stencil[4]
    //    + 252.0*stencil[5] - 42.0*stencil[6] + 4.0*stencil[7])/420.0
    return (-3.0*stencil[0] + 25.0*stencil[1] - 101.0*stencil[2] + 319.0*stencil[3] + 214.0*stencil[4]
            - 38.0*stencil[5] + 4.0*stencil[6])/420.0;
}
// Kernel: UP7 "plus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_p_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_p_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_UP7_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 8-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -3;
    int ib1 = 4;
    REAL stencil[8];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_UP7_kernel_P(stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//----------------------------------------------------------------------------------------------------------------------------
// Device helper: "minus"-side UP7 reconstruction, the mirror image of OCFD_UP7_kernel_P:
// weights (-3, 25, -101, 319, 214, -38, 4)/420 applied to stencil[7] down to stencil[1].
// stencil[0] is not read.
__device__ REAL OCFD_UP7_kernel_M(REAL *stencil){
    // Alternative 8-point formula left disabled by the original author:
    //   -(3.0*stencil[7] - 28.0*stencil[6] + 126.0*stencil[5] - 420.0*stencil[4] + 105.0*stencil[3]
    //     + 252.0*stencil[2] - 42.0*stencil[1] + 4.0*stencil[0])/420.0
    return (-3.0*stencil[7] + 25.0*stencil[6] - 101.0*stencil[5] + 319.0*stencil[4] + 214.0*stencil[3]
            - 38.0*stencil[2] + 4.0*stencil[1])/420.0;
}
// Kernel: UP7 "minus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_m_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_m_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_UP7_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 8-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -4;
    int ib1 = 3;
    REAL stencil[8];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_UP7_kernel_M(stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//==================================================================================================================================
//===========================================================OMP6===================================================================
// Device helper: "plus"-side OMP6 reconstruction from an 8-point stencil.
//   OMP6_FLAG : selects the two scheme parameters (m, n): 1 -> (0.001, 0),
//               2 -> (0, -1/140), anything else -> (0.015, 0).
//   stencil   : 8 consecutive values; stencil[3] is treated as the current cell.
// A linear 8-point combination is computed first; if it fails the acceptance test below it is
// pulled back into an interval [m, n] built from neighbouring values and limited curvatures.
// NOTE(review): the limiting step looks like the monotonicity-preserving limiter of
// Suresh & Huynh — confirm against the scheme's reference.
__device__ REAL OCFD_OMP6_kernel_P(int OMP6_FLAG, REAL *stencil){
REAL m,n;
if(OMP6_FLAG == 1){
m = 0.001; n = 0.0;
}else if(OMP6_FLAG == 2){
m = 0.0; n = - 1.0/140.0;
}else{
m = 0.015; n = 0.0;
}
// Re-parameterise: mid_nf <- (m + n)/2 and m <- (m - n)/2 (both use the original m and n).
REAL mid_nf = 0.5*(m + n); m = 0.5*(m - n);
// Linear reconstruction; mid_nf is reused to hold the result.
mid_nf = ( 60.0*mid_nf * stencil[7] +
( 1.0 - 60.0*m - 360.0*mid_nf) * stencil[6] +
(-8.0 + 360.0*m + 900.0*mid_nf) * stencil[5] +
(37.0 - 900.0*m - 1200.0*mid_nf) * stencil[4] +
(37.0 + 1200.0*m + 900.0*mid_nf) * stencil[3] +
(-8.0 - 900.0*m - 360.0*mid_nf) * stencil[2] +
(1.0 + 360.0*m + 60.0*mid_nf) * stencil[1] -
60.0*m * stencil[0])/60.0;
// Acceptance test: the linear value is kept unless the product below reaches 1e-10, i.e.
// unless it lies outside the interval between stencil[3] and m on the same side of both.
m = stencil[3] + minmod2((stencil[4] - stencil[3]), (stencil[3] - stencil[2]));
if((mid_nf - stencil[3])*(mid_nf - m) >= 1.e-10){
REAL tmp, tmp1;
// Second differences centred on stencil[3] (m) and stencil[4] (n).
m = stencil[2] + stencil[4] - 2.0*stencil[3];
n = stencil[3] + stencil[5] - 2.0*stencil[4];
tmp = 4.0*m - n;
tmp1 = 4.0*n - m;
// tmp: average of stencil[3] and stencil[4] corrected by the limited curvature.
tmp = 0.5*(stencil[3] + stencil[4]) - 0.5*minmod4(tmp, tmp1, n, m);
// n is now the second difference centred on stencil[2].
n = stencil[1] + stencil[3] - 2.0*stencil[2];
// tmp1: extrapolation from stencil[2], stencil[3] plus 4/3 of the limited curvature.
tmp1 = stencil[3] + 0.5*(stencil[3] - stencil[2]) + 4.0*minmod4(4*n - m, 4*m - n, m, n)/3.0;
{
// Lower bound: m = max( min(stencil[3], stencil[4], tmp),
//                       min(stencil[3], stencil[3] + 4*(stencil[3]-stencil[2]), tmp1) ).
m = fmin(stencil[3], stencil[4]);
m = fmin(m, tmp);
n = stencil[3] + 4.0*(stencil[3] - stencil[2]);
n = fmin(stencil[3], n);
n = fmin(n, tmp1);
m = fmax(m, n);
}
{
// Upper bound: n = min( max(stencil[3], stencil[4], tmp),
//                       max(stencil[3], stencil[3] + 4*(stencil[3]-stencil[2]), tmp1) ).
tmp = fmax(stencil[3], tmp);
tmp = fmax(stencil[4], tmp);
n = stencil[3] + 4.0*(stencil[3] - stencil[2]);
n = fmax(stencil[3], n);
n = fmax(n, tmp1);
n = fmin(tmp, n);
}
// Move mid_nf by the smaller of its distances to the two bounds when both lie on the same
// side; leave it unchanged when it is already between them.
mid_nf = mid_nf + minmod2(n - mid_nf, m - mid_nf);
}
return mid_nf;
}
// Kernel: OMP6 "plus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_p_kernel.
//   OMP6_FLAG   : parameter-set selector forwarded to OCFD_OMP6_kernel_P.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_p_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_OMP6_P_kernel(int i, int OMP6_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 8-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -3;
    int ib1 = 4;
    REAL stencil[8];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_OMP6_kernel_P(OMP6_FLAG, stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//---------------------------------------------------------OMP6_M---------------------------------------------------------------
// Device helper: "minus"-side OMP6 reconstruction from an 8-point stencil. It is the mirror
// image of OCFD_OMP6_kernel_P (stencil index k <-> 7-k), so stencil[4] plays the role of the
// current cell, stencil[5] of its upwind neighbour and stencil[3] of its downwind neighbour.
//   OMP6_FLAG : selects the two scheme parameters (m, n): 1 -> (0.001, 0),
//               2 -> (0, -1/140), anything else -> (0.015, 0).
//   stencil   : 8 consecutive values; stencil[0] .. stencil[7] are read.
// A linear 8-point combination is computed first; if it fails the acceptance test it is
// pulled back into an interval [m, n] built from neighbouring values and limited curvatures.
//
// Fixes (both restore the mirror symmetry with the P variant):
//   * the extrapolated value tmp1 used the slope (stencil[4] - stencil[3]); the mirrored
//     slope is (stencil[4] - stencil[5]), the same one already used two lines further down
//     for stencil[4] + 4*(stencil[4] - stencil[5]);
//   * the lower bound started from fmin(stencil[5], stencil[3]); it must start from the
//     current cell and its downwind neighbour, fmin(stencil[4], stencil[3]), exactly like the
//     upper bound in this same function.
__device__ REAL OCFD_OMP6_kernel_M(int OMP6_FLAG, REAL *stencil){
    REAL m,n;
    if(OMP6_FLAG == 1){
        m = 0.001; n = 0.0;
    }else if(OMP6_FLAG == 2){
        m = 0.0; n = - 1.0/140.0;
    }else{
        m = 0.015; n = 0.0;
    }

    // Re-parameterise: mid_nf <- (m + n)/2 and m <- (m - n)/2 (both use the original m and n).
    REAL mid_nf = 0.5*(m + n); m = 0.5*(m - n);

    // Linear reconstruction; mid_nf is reused to hold the result.
    mid_nf = ( 60.0*mid_nf * stencil[0] +
              (1.0 - 60.0*m - 360.0*mid_nf) * stencil[1] +
              (-8.0 + 360.0*m + 900.0*mid_nf) * stencil[2] +
              (37.0 - 900.0*m - 1200.0*mid_nf) * stencil[3] +
              (37.0 + 1200.0*m + 900.0*mid_nf) * stencil[4] +
              (-8.0 - 900.0*m - 360.0*mid_nf) * stencil[5] +
              (1.0 + 360.0*m + 60.0*mid_nf) * stencil[6] -
              60.0*m * stencil[7])/60.0;

    // Acceptance test: the linear value is kept unless the product below reaches 1e-10.
    m = stencil[4] + minmod2((stencil[3] - stencil[4]), (stencil[4] - stencil[5]));

    if((mid_nf - stencil[4])*(mid_nf - m) >= 1.e-10){
        REAL tmp, tmp1;

        // Second differences centred on stencil[4] (m) and stencil[3] (n).
        m = stencil[5] + stencil[3] - 2.0*stencil[4];
        n = stencil[4] + stencil[2] - 2.0*stencil[3];
        tmp = 4.0*m - n;
        tmp1 = 4.0*n - m;
        // tmp: average of stencil[4] and stencil[3] corrected by the limited curvature.
        tmp = 0.5*(stencil[4] + stencil[3]) - 0.5*minmod4(tmp, tmp1, n, m);

        // n is now the second difference centred on stencil[5].
        n = stencil[6] + stencil[4] - 2.0*stencil[5];
        // tmp1: extrapolation from stencil[5], stencil[4] plus 4/3 of the limited curvature.
        tmp1 = stencil[4] + 0.5*(stencil[4] - stencil[5]) + 4.0*minmod4(4*n - m, 4*m - n, m, n)/3.0;

        {
            // Lower bound: m = max( min(stencil[4], stencil[3], tmp),
            //                       min(stencil[4], stencil[4] + 4*(stencil[4]-stencil[5]), tmp1) ).
            m = fmin(stencil[4], stencil[3]);
            m = fmin(m, tmp);
            n = stencil[4] + 4.0*(stencil[4] - stencil[5]);
            n = fmin(stencil[4], n);
            n = fmin(n, tmp1);
            m = fmax(m, n);
        }
        {
            // Upper bound: n = min( max(stencil[4], stencil[3], tmp),
            //                       max(stencil[4], stencil[4] + 4*(stencil[4]-stencil[5]), tmp1) ).
            tmp = fmax(stencil[4], tmp);
            tmp = fmax(stencil[3], tmp);
            n = stencil[4] + 4.0*(stencil[4] - stencil[5]);
            n = fmax(stencil[4], n);
            n = fmax(n, tmp1);
            n = fmin(tmp, n);
        }

        // Move mid_nf by the smaller of its distances to the two bounds when both lie on the
        // same side; leave it unchanged when it is already between them.
        mid_nf = mid_nf + minmod2(n - mid_nf, m - mid_nf);
    }
    return mid_nf;
}
// Kernel: OMP6 "minus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_m_kernel.
//   OMP6_FLAG   : parameter-set selector forwarded to OCFD_OMP6_kernel_M.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_m_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_OMP6_M_kernel(int i, int OMP6_FLAG, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 8-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -4;
    int ib1 = 3;
    REAL stencil[8];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_OMP6_kernel_M(OMP6_FLAG, stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
// Device helper: "plus"-side WENO5 reconstruction from a 5-point stencil (offsets -2 .. 2,
// stencil[2] being the current cell).
// Three 3-point candidates are blended with weights proportional to
// coefficient/(12*ep + IS_k)^2, coefficients (3, 6, 1); the division is avoided by
// multiplying with the squared terms of the two other candidates instead.
__device__ REAL OCFD_weno5_kernel_P(REAL *stencil){
    const REAL ep = 1e-6;
    REAL IS_hi = 0.0, IS_mid = 0.0, IS_lo = 0.0;   // smoothness indicators (scaled by 12)
    REAL d;                                        // scratch for the difference expressions

    // Candidate on stencil[2..4]; q_* are 6x the candidate values.
    d = stencil[2] - 2.0*stencil[3] + stencil[4];     IS_hi += 13*d*d;
    d = 3.0*stencil[2] - 4.0*stencil[3] + stencil[4]; IS_hi += 3*d*d;
    const REAL q_hi = (2.0*stencil[2] + 5.0*stencil[3] - stencil[4]);

    // Candidate on stencil[1..3].
    d = stencil[1] - 2.0*stencil[2] + stencil[3];     IS_mid += 13*d*d;
    d = stencil[1] - stencil[3];                      IS_mid += 3*d*d;
    const REAL q_mid = (-stencil[1] + 5.0*stencil[2] + 2.0*stencil[3]);

    // Candidate on stencil[0..2].
    d = stencil[0] - 2.0*stencil[1] + stencil[2];     IS_lo += 13*d*d;
    d = stencil[0] - 4.0*stencil[1] + 3.0*stencil[2]; IS_lo += 3*d*d;
    const REAL q_lo = (2.0*stencil[0] - 7.0*stencil[1] + 11.0*stencil[2]);

    // Un-normalised weights.
    const REAL w_hi  = 3.0*((12.0*ep + IS_mid)*(12.0*ep + IS_mid))*((12.0*ep + IS_lo)*(12.0*ep + IS_lo));
    const REAL w_mid = 6.0*((12.0*ep + IS_hi)*(12.0*ep + IS_hi))*((12.0*ep + IS_lo)*(12.0*ep + IS_lo));
    const REAL w_lo  = ((12.0*ep + IS_hi)*(12.0*ep + IS_hi))*((12.0*ep + IS_mid)*(12.0*ep + IS_mid));

    // Normalise the weights and remove the factor 6 of the candidates in one division.
    return (w_hi*q_hi + w_mid*q_mid + w_lo*q_lo)/(6.0*(w_hi + w_mid + w_lo));
}
// Kernel: WENO5 "plus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_p_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_p_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_weno5_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 5-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -2;
    int ib1 = 2;
    REAL stencil[5];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_p(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_weno5_kernel_P(stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_p_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
// Device helper: "minus"-side WENO5 reconstruction from a 5-point stencil. The formulas are
// those of OCFD_weno5_kernel_P with the stencil index mirrored (k <-> 4-k).
// Three 3-point candidates are blended with weights proportional to
// coefficient/(12*ep + IS_k)^2, coefficients (3, 6, 1); the division is avoided by
// multiplying with the squared terms of the two other candidates instead.
__device__ REAL OCFD_weno5_kernel_M(REAL *stencil){
    const REAL ep = 1e-6;
    REAL IS_lo = 0.0, IS_mid = 0.0, IS_hi = 0.0;   // smoothness indicators (scaled by 12)
    REAL d;                                        // scratch for the difference expressions

    // Candidate on stencil[0..2]; q_* are 6x the candidate values.
    d = stencil[2] - 2.0*stencil[1] + stencil[0];     IS_lo += 13*d*d;
    d = 3.0*stencil[2] - 4.0*stencil[1] + stencil[0]; IS_lo += 3*d*d;
    const REAL q_lo = (2.0*stencil[2] + 5.0*stencil[1] - stencil[0]);

    // Candidate on stencil[1..3].
    d = stencil[3] - 2.0*stencil[2] + stencil[1];     IS_mid += 13*d*d;
    d = stencil[3] - stencil[1];                      IS_mid += 3*d*d;
    const REAL q_mid = (-stencil[3] + 5.0*stencil[2] + 2.0*stencil[1]);

    // Candidate on stencil[2..4].
    d = stencil[4] - 2.0*stencil[3] + stencil[2];     IS_hi += 13*d*d;
    d = stencil[4] - 4.0*stencil[3] + 3.0*stencil[2]; IS_hi += 3*d*d;
    const REAL q_hi = (2.0*stencil[4] - 7.0*stencil[3] + 11.0*stencil[2]);

    // Un-normalised weights.
    const REAL w_lo  = 3.0*((12.0*ep + IS_mid)*(12.0*ep + IS_mid))*((12.0*ep + IS_hi)*(12.0*ep + IS_hi));
    const REAL w_mid = 6.0*((12.0*ep + IS_lo)*(12.0*ep + IS_lo))*((12.0*ep + IS_hi)*(12.0*ep + IS_hi));
    const REAL w_hi  = ((12.0*ep + IS_lo)*(12.0*ep + IS_lo))*((12.0*ep + IS_mid)*(12.0*ep + IS_mid));

    // Normalise the weights and remove the factor 6 of the candidates in one division.
    return (w_lo*q_lo + w_mid*q_mid + w_hi*q_hi)/(6.0*(w_lo + w_mid + w_hi));
}
// Kernel: WENO5 "minus"-side flux reconstruction for variable `i` of f, accumulated into du.
//   i           : variable index forwarded to get_data_kernel / put_du_m_kernel.
//   flagxyzb    : direction / boundary flags forwarded to the helpers.
//   f, du, Ajac : input SoA, output SoA and the field forwarded to put_du_m_kernel.
//   job         : index range handled by this launch.
// Needs dynamically sized shared memory (`sort`), used as scratch by get_data_kernel.
__global__ void OCFD_weno5_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaJobPackage job){
    extern __shared__ REAL sort[];

    // 5-point stencil, offsets ia1 .. ib1 around the current cell.
    int ia1 = -2;
    int ib1 = 2;
    REAL stencil[5];
    dim3 coords;

    // A zero return means this thread has nothing to process.
    const int has_work = get_data_kernel(flagxyzb.x, &coords, f, i, stencil, ia1, ib1, sort, job);
    if(has_work == 0) return;

    // The boundary helper may provide the flux itself; a non-zero return asks for the
    // interior scheme instead.
    REAL flux_here;
    const int use_interior = OCFD_bound_scheme_kernel_m(&flux_here, flagxyzb, coords, stencil, ia1, ib1, job);
    if(use_interior != 0){
        flux_here = OCFD_weno5_kernel_M(stencil);
    }

    // Presumably fetches the flux of the previous lane (shift by 1) — confirm against the
    // definition of __shfl_up_double.
    const REAL flux_prev = __shfl_up_double(flux_here, 1, warpSize);

    // Thread 0 is excluded from the write-back.
    if(threadIdx.x != 0){
        put_du_m_kernel(flagxyzb, coords, flux_here, flux_prev, du, i, Ajac, job);
    }
}
//-------------------------------------------------------------- CD6----------------------------------------------------------//
// Kernel: 6-point central difference of pf along x, written to pfx.
//   pf  : input field, read through get_Field_LAP at x-3 .. x+3 (indices include the halo).
//   pfx : output field, written through get_Field at the halo-free index (x-LAP, y-LAP, z-LAP).
//   job : index range [job.start, job.end) in halo-inclusive indices.
// Thread mapping: thread/block x -> x, y -> y, z -> z. Threads outside the job range do nothing.
__global__ void OCFD_dx0_CD6_kernel(cudaField pf , cudaField pfx , cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    // Neighbours behind (m*) and ahead (p*) of the current cell along x.
    const REAL m3 = get_Field_LAP(pf, x-3, y, z);
    const REAL m2 = get_Field_LAP(pf, x-2, y, z);
    const REAL m1 = get_Field_LAP(pf, x-1, y, z);
    const REAL p1 = get_Field_LAP(pf, x+1, y, z);
    const REAL p2 = get_Field_LAP(pf, x+2, y, z);
    const REAL p3 = get_Field_LAP(pf, x+3, y, z);

    // Weights 45, -9, 1 on the antisymmetric differences, divided by 60*hx.
    get_Field(pfx, x-LAP, y-LAP, z-LAP) = (
         45.0*( p1 - m1 )
         -9.0*( p2 - m2 )
             +( p3 - m3 ) )
        /(60.0*hx_d);
}
// Kernel: 6-point central difference of pf along y, written to pfy.
//   pf  : input field, read through get_Field_LAP at y-3 .. y+3 (indices include the halo).
//   pfy : output field, written through get_Field at the halo-free index (x-LAP, y-LAP, z-LAP).
//   job : index range [job.start, job.end) in halo-inclusive indices.
// Thread mapping: thread/block x -> y, y -> x, z -> z (x and y are swapped with respect to
// the dx0 kernel). Threads outside the job range do nothing.
__global__ void OCFD_dy0_CD6_kernel(cudaField pf , cudaField pfy , cudaJobPackage job){
    unsigned int x = blockDim.y * blockIdx.y + threadIdx.y + job.start.x;
    unsigned int y = blockDim.x * blockIdx.x + threadIdx.x + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    // Neighbours behind (m*) and ahead (p*) of the current cell along y.
    const REAL m3 = get_Field_LAP(pf, x, y-3, z);
    const REAL m2 = get_Field_LAP(pf, x, y-2, z);
    const REAL m1 = get_Field_LAP(pf, x, y-1, z);
    const REAL p1 = get_Field_LAP(pf, x, y+1, z);
    const REAL p2 = get_Field_LAP(pf, x, y+2, z);
    const REAL p3 = get_Field_LAP(pf, x, y+3, z);

    // Weights 45, -9, 1 on the antisymmetric differences, divided by 60*hy.
    get_Field(pfy, x-LAP, y-LAP, z-LAP) = (
         45.0*( p1 - m1 )
         -9.0*( p2 - m2 )
             +( p3 - m3 ) )
        /(60.0*hy_d);
}
// Kernel: 6-point central difference of pf along z, written to pfz.
//   pf  : input field, read through get_Field_LAP at z-3 .. z+3 (indices include the halo).
//   pfz : output field, written through get_Field at the halo-free index (x-LAP, y-LAP, z-LAP).
//   job : index range [job.start, job.end) in halo-inclusive indices.
// Thread mapping: thread/block x -> z, y -> x, z -> y. Threads outside the job range do nothing.
__global__ void OCFD_dz0_CD6_kernel(cudaField pf , cudaField pfz , cudaJobPackage job){
    unsigned int x = blockDim.y * blockIdx.y + threadIdx.y + job.start.x;
    unsigned int y = blockDim.z * blockIdx.z + threadIdx.z + job.start.y;
    unsigned int z = blockDim.x * blockIdx.x + threadIdx.x + job.start.z;

    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    // Neighbours behind (m*) and ahead (p*) of the current cell along z.
    const REAL m3 = get_Field_LAP(pf, x, y, z-3);
    const REAL m2 = get_Field_LAP(pf, x, y, z-2);
    const REAL m1 = get_Field_LAP(pf, x, y, z-1);
    const REAL p1 = get_Field_LAP(pf, x, y, z+1);
    const REAL p2 = get_Field_LAP(pf, x, y, z+2);
    const REAL p3 = get_Field_LAP(pf, x, y, z+3);

    // Weights 45, -9, 1 on the antisymmetric differences, divided by 60*hz.
    get_Field(pfz, x-LAP, y-LAP, z-LAP) = (
         45.0*( p1 - m1 )
         -9.0*( p2 - m2 )
             +( p3 - m3 ) )
        /(60.0*hz_d);
}
//__global__ void OCFD_dz0_CD6_kernel(cudaField pf , cudaField pfz , cudaJobPackage job){
// // eyes on cells WITH LAPs
// extern __shared__ REAL hh[];
// unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
// unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
// unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
// unsigned int id = threadIdx.z + (blockDim.z + 6)* (threadIdx.x + threadIdx.y * blockDim.x);
//
// if(threadIdx.z < 6) hh[id] = get_Field_LAP(pf, x, y, z-3);
//
// if(x < job.end.x && y < job.end.y && z < job.end.z){
//
// hh[id + 6] = get_Field_LAP(pf, x, y, z+3);
// __syncthreads();
//
// get_Field(pfz , x-LAP,y-LAP,z-LAP) = (
// 45.0*( hh[id + 4] - hh[id + 2] )
// -9.0*( hh[id + 5] - hh[id + 1] )
// +( hh[id + 6] - hh[id ] ) )
// /(60.0*hz_d);
// }
//}
//===================================================================================================================================//
//-----------------------------------------------------------------CD8---------------------------------------------------------------//
/*
 * CD8 x-derivative: applies the 9-point central-difference stencil
 *   ( 672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3]) - 3*(f[+4]-f[-4]) ) / (840*hx_d)
 * to pf along x and stores the result in pfx.
 *
 * Thread mapping: block/grid .x -> x, .y -> y, .z -> z, each offset by
 * job.start; indices count cells WITH the LAP layers.  pf is read through
 * get_Field_LAP, pfx is written through get_Field at (x-LAP, y-LAP, z-LAP).
 */
__global__ void OCFD_dx0_CD8_kernel(cudaField pf , cudaField pfx , cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x + job.start.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y + job.start.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z + job.start.z;

    // Threads outside the job box have nothing to do.
    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    const REAL m4 = get_Field_LAP(pf, x-4, y, z);
    const REAL m3 = get_Field_LAP(pf, x-3, y, z);
    const REAL m2 = get_Field_LAP(pf, x-2, y, z);
    const REAL m1 = get_Field_LAP(pf, x-1, y, z);
    const REAL p1 = get_Field_LAP(pf, x+1, y, z);
    const REAL p2 = get_Field_LAP(pf, x+2, y, z);
    const REAL p3 = get_Field_LAP(pf, x+3, y, z);
    const REAL p4 = get_Field_LAP(pf, x+4, y, z);

    // Coefficient literals are kept exactly as written (168 and 3 are integer literals).
    get_Field(pfx, x-LAP, y-LAP, z-LAP) =
        ( 672.*( p1 - m1 ) - 168*( p2 - m2 ) + 32.*( p3 - m3 ) - 3*( p4 - m4 ) ) / (840.0*hx_d);
}
/*
 * CD8 y-derivative: applies the 9-point central-difference stencil
 *   ( 672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3]) - 3*(f[+4]-f[-4]) ) / (840*hy_d)
 * to pf along y and stores the result in pfy.
 *
 * Thread mapping (x and y are swapped): block/grid .y -> x, .x -> y,
 * .z -> z, each offset by job.start; indices count cells WITH the LAP
 * layers.  pf is read through get_Field_LAP, pfy is written through
 * get_Field at (x-LAP, y-LAP, z-LAP).
 */
__global__ void OCFD_dy0_CD8_kernel(cudaField pf , cudaField pfy , cudaJobPackage job){
    const unsigned int x = threadIdx.y + blockIdx.y * blockDim.y + job.start.x;
    const unsigned int y = threadIdx.x + blockIdx.x * blockDim.x + job.start.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z + job.start.z;

    // Threads outside the job box have nothing to do.
    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    const REAL m4 = get_Field_LAP(pf, x, y-4, z);
    const REAL m3 = get_Field_LAP(pf, x, y-3, z);
    const REAL m2 = get_Field_LAP(pf, x, y-2, z);
    const REAL m1 = get_Field_LAP(pf, x, y-1, z);
    const REAL p1 = get_Field_LAP(pf, x, y+1, z);
    const REAL p2 = get_Field_LAP(pf, x, y+2, z);
    const REAL p3 = get_Field_LAP(pf, x, y+3, z);
    const REAL p4 = get_Field_LAP(pf, x, y+4, z);

    get_Field(pfy , x-LAP,y-LAP,z-LAP) =
        ( 672.0*( p1 - m1 ) - 168.0*( p2 - m2 ) + 32.*( p3 - m3 ) - 3.*( p4 - m4 ) ) / (840.0*hy_d);
}
/*
 * CD8 z-derivative: applies the 9-point central-difference stencil
 *   ( 672*(f[+1]-f[-1]) - 168*(f[+2]-f[-2]) + 32*(f[+3]-f[-3]) - 3*(f[+4]-f[-4]) ) / (840*hz_d)
 * to pf along z and stores the result in pfz.
 *
 * Thread mapping (permuted): block/grid .y -> x, .z -> y, .x -> z, each
 * offset by job.start; indices count cells WITH the LAP layers.  pf is read
 * through get_Field_LAP, pfz is written through get_Field at
 * (x-LAP, y-LAP, z-LAP).
 */
__global__ void OCFD_dz0_CD8_kernel(cudaField pf , cudaField pfz , cudaJobPackage job){
    const unsigned int x = threadIdx.y + blockIdx.y * blockDim.y + job.start.x;
    const unsigned int y = threadIdx.z + blockIdx.z * blockDim.z + job.start.y;
    const unsigned int z = threadIdx.x + blockIdx.x * blockDim.x + job.start.z;

    // Threads outside the job box have nothing to do.
    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    const REAL m4 = get_Field_LAP(pf, x, y, z-4);
    const REAL m3 = get_Field_LAP(pf, x, y, z-3);
    const REAL m2 = get_Field_LAP(pf, x, y, z-2);
    const REAL m1 = get_Field_LAP(pf, x, y, z-1);
    const REAL p1 = get_Field_LAP(pf, x, y, z+1);
    const REAL p2 = get_Field_LAP(pf, x, y, z+2);
    const REAL p3 = get_Field_LAP(pf, x, y, z+3);
    const REAL p4 = get_Field_LAP(pf, x, y, z+4);

    get_Field(pfz , x-LAP,y-LAP,z-LAP) =
        ( 672.0*( p1 - m1 ) - 168.0*( p2 - m2 ) + 32.*( p3 - m3 ) - 3.*( p4 - m4 ) ) / (840.0*hz_d);
}
#ifdef __cplusplus
}
#endif
#include <math.h>
#include "parameters.h"
#include "utility.h"
#include "OCFD_Schemes.h"
#include "OCFD_Schemes_Choose.h"
#include "OCFD_Schemes_hybrid_auto.h"
#include "OCFD_bound_Scheme.h"
#include "OCFD_flux_charteric.h"
#include "parameters_d.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#ifdef __cplusplus
extern "C"{
#endif
// Used in viscous Jacobian --------------------------------------------------------------------------------------------
/*
 * Host launcher for the x-direction central-difference kernel chosen by
 * Scheme_vis_ID (203 -> OCFD_CD6_kernel, 204 -> OCFD_CD8_kernel).  Any other
 * ID launches nothing.
 *
 *   pf, pfx        input field and output field handed to the kernel
 *   job_in         index range; its extents size the grid
 *   blockdim_in    not used here: the block shape request is fixed at 8x8x4
 *   stream         stream the kernel is enqueued on
 *   boundl, boundr forwarded to OCFD_bound together with the direction flag
 */
void OCFD_dx0(cudaField pf, cudaField pfx, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    dim3 flagxyzb(1, 0, 0);     // .x = 1 marks this as the x-direction call
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);
    cal_grid_block_dim(&griddim, &blockdim, 8, 8, 4, extent.x, extent.y, extent.z);

    if(Scheme_vis_ID == 203){
        CUDA_LAUNCH((OCFD_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfx, job_in) ));
    }else if(Scheme_vis_ID == 204){
        CUDA_LAUNCH((OCFD_CD8_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfx, job_in) ));
    }
}
/*
 * Host launcher for the y-direction central-difference kernel chosen by
 * Scheme_vis_ID (203 -> OCFD_CD6_kernel, 204 -> OCFD_CD8_kernel).  Any other
 * ID launches nothing.
 *
 *   pf, pfy        input field and output field handed to the kernel
 *   job_in         index range; its extents size the grid
 *   blockdim_in    requested block shape, passed to cal_grid_block_dim
 *   stream         stream the kernel is enqueued on
 *   boundl, boundr forwarded to OCFD_bound together with the direction flag
 */
void OCFD_dy0(cudaField pf, cudaField pfy, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    dim3 flagxyzb(2, 0, 0);     // .x = 2 marks this as the y-direction call
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x, blockdim_in.y, blockdim_in.z, extent.x, extent.y, extent.z );

    if(Scheme_vis_ID == 203){
        CUDA_LAUNCH((OCFD_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfy, job_in) ));
    }else if(Scheme_vis_ID == 204){
        CUDA_LAUNCH((OCFD_CD8_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfy, job_in) ));
    }
}
/*
 * Host launcher for the z-direction central-difference kernel chosen by
 * Scheme_vis_ID (203 -> OCFD_CD6_kernel, 204 -> OCFD_CD8_kernel).  Any other
 * ID launches nothing.
 *
 *   pf, pfz        input field and output field handed to the kernel
 *   job_in         index range; its extents size the grid
 *   blockdim_in    requested block shape, passed to cal_grid_block_dim
 *   stream         stream the kernel is enqueued on
 *   boundl, boundr forwarded to OCFD_bound together with the direction flag
 */
void OCFD_dz0(cudaField pf, cudaField pfz, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    dim3 flagxyzb(3, 0, 0);     // .x = 3 marks this as the z-direction call
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);
    // Note: the job extents are handed over in (x, z, y) order for this direction.
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x, blockdim_in.y, blockdim_in.z, extent.x, extent.z, extent.y );

    if(Scheme_vis_ID == 203){
        CUDA_LAUNCH((OCFD_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfz, job_in) ));
    }else if(Scheme_vis_ID == 204){
        CUDA_LAUNCH((OCFD_CD8_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pfz, job_in) ));
    }
}
// Used in inviscid Jacobian flux+ --------------------------------------------------------------------------------------------
/*
 * x-direction dispatcher for the "_P_" kernel family.  Picks the kernel from
 * Scheme_invis_ID (301..310) and enqueues it on *stream.
 *
 *  - IF_CHARTERIC == 1: one launch of a "character" kernel.  Only schemes
 *    304, 305 and 310 have one; 301-303 and 306-309 print a message on
 *    rank 0 (my_id == 0) and launch nothing.
 *  - otherwise: five launches, with i = 0..4 passed as the first kernel
 *    argument.
 *  Scheme 310 further selects on HybridAuto.Style (1 or 2); other Style
 *  values launch nothing.  Scheme IDs outside 301..310 launch nothing.
 *
 * The grid is sized with a block x-size of blockdim_in.x-1; afterwards the
 * block gets one extra thread in x and the job starts one index earlier in x.
 * No dynamic shared memory is requested (third launch parameter is 0).
 */
void OCFD_dx1(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc,
    cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    // field with LAPs
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x-1, blockdim_in.y, blockdim_in.z, extent.x, extent.y, extent.z );

    // .x: positive-direction boundary; .y: negative-direction boundary; .z: non-reflecting boundary
    dim3 flagxyzb(1, 0, Non_ref[0]);
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);

    job_in.start.x -= 1;
    blockdim.x += 1;

    if(IF_CHARTERIC != 1){
        // Component-wise path: same kernel launched once per i.
        for(int i=0; i<5; i++){
            switch(Scheme_invis_ID){
                case 301:
                    CUDA_LAUNCH((OCFD_UP7_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 302:
                    CUDA_LAUNCH((OCFD_weno5_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 303:
                    CUDA_LAUNCH((OCFD_weno7_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 304:   // weno7_symbo
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 305:   // weno7_symbo_limiter
                    CUDA_LAUNCH(( OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 306:
                    CUDA_LAUNCH((OCFD_NND2_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 307:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 308:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 309:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 310:
                    if(HybridAuto.Style == 1){
                        CUDA_LAUNCH((OCFD_HybridAuto_P_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
                    }else if(HybridAuto.Style == 2){
                        CUDA_LAUNCH((OCFD_HybridAuto_P_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
                    }
                    break;
            }
        }
        return;
    }

    // Characteristic path: a single launch, available for 304, 305 and 310 only.
    switch(Scheme_invis_ID){
        case 301: case 302: case 303:
        case 306: case 307: case 308: case 309:
            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
            break;
        case 304:   // weno7_symbo
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 0, *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 305:   // weno7_symbo_limiter
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 0, *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 310:
            if(HybridAuto.Style == 1){
                CUDA_LAUNCH((OCFD_HybridAuto_character_P_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
            }else if(HybridAuto.Style == 2){
                CUDA_LAUNCH((OCFD_HybridAuto_character_P_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
            }
            break;
    }
}
/*
 * y-direction dispatcher for the "_P_" kernel family.  Picks the kernel from
 * Scheme_invis_ID (301..310) and enqueues it on *stream.
 *
 *  - IF_CHARTERIC == 1: one launch of a "character" kernel.  Only schemes
 *    304, 305 and 310 have one; 301-303 and 306-309 print a message on
 *    rank 0 (my_id == 0) and launch nothing.
 *  - otherwise: five launches, with i = 0..4 passed as the first kernel
 *    argument.
 *  Scheme 310 further selects on HybridAuto.Style (1 or 2); other Style
 *  values launch nothing.  Scheme IDs outside 301..310 launch nothing.
 *
 * blockdim_in is not used: the grid is sized for a fixed 8x7x4 block request,
 * then the block gets one extra thread in y and the job starts one index
 * earlier in y.  Every launch requests 16*8*4*sizeof(REAL) bytes of dynamic
 * shared memory.
 */
void OCFD_dy1(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc,
    cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, extent.x, extent.y, extent.z);

    dim3 flagxyzb(2, 0, Non_ref[2]);
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);

    job_in.start.y -= 1;
    blockdim.y += 1;

    if(IF_CHARTERIC != 1){
        // Component-wise path: same kernel launched once per i.
        for(int i=0; i<5; i++){
            switch(Scheme_invis_ID){
                case 301:
                    CUDA_LAUNCH((OCFD_UP7_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 302:
                    CUDA_LAUNCH((OCFD_weno5_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 303:
                    CUDA_LAUNCH((OCFD_weno7_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 304:   // weno7_symbo
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 305:   // weno7_symbo_limiter
                    CUDA_LAUNCH(( OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 306:
                    CUDA_LAUNCH((OCFD_NND2_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 307:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 308:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 309:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 310:
                    if(HybridAuto.Style == 1){
                        CUDA_LAUNCH((OCFD_HybridAuto_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
                    }else if(HybridAuto.Style == 2){
                        CUDA_LAUNCH((OCFD_HybridAuto_P_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
                    }
                    break;
            }
        }
        return;
    }

    // Characteristic path: a single launch, available for 304, 305 and 310 only.
    switch(Scheme_invis_ID){
        case 301: case 302: case 303:
        case 306: case 307: case 308: case 309:
            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
            break;
        case 304:   // weno7_symbo
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 305:   // weno7_symbo_limiter
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 310:
            if(HybridAuto.Style == 1){
                CUDA_LAUNCH((OCFD_HybridAuto_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
            }else if(HybridAuto.Style == 2){
                CUDA_LAUNCH((OCFD_HybridAuto_character_P_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
            }
            break;
    }
}
// Used in inviscid Jacobian flux+ (z direction) -------------------------------------------------------------------------------
/*
 * z-direction dispatcher for the "_P_" kernel family.  Picks the kernel from
 * Scheme_invis_ID (301..310) and enqueues it on *stream.
 *
 *  - IF_CHARTERIC == 1: one launch of a "character" kernel.  Only schemes
 *    304, 305 and 310 have one; 301-303 and 306-309 print a message on
 *    rank 0 (my_id == 0) and launch nothing.
 *  - otherwise: five launches, with i = 0..4 passed as the first kernel
 *    argument.
 *  Scheme 310 further selects on HybridAuto.Style (1 or 2); other Style
 *  values launch nothing.  Scheme IDs outside 301..310 launch nothing.
 *
 * blockdim_in is not used: the grid is sized for a fixed 8x7x4 block request
 * with the job extents passed in (x, z, y) order, then the block gets one
 * extra thread in its y dimension and the job starts one index earlier in z.
 * Every launch requests 16*8*4*sizeof(REAL) bytes of dynamic shared memory.
 */
void OCFD_dz1(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc,
    cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, extent.x, extent.z, extent.y);

    dim3 flagxyzb(3, 0, Non_ref[4]);
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);

    job_in.start.z -= 1;
    blockdim.y += 1;

    if(IF_CHARTERIC != 1){
        // Component-wise path: same kernel launched once per i.
        for(int i=0; i<5; i++){
            switch(Scheme_invis_ID){
                case 301:
                    CUDA_LAUNCH((OCFD_UP7_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 302:
                    CUDA_LAUNCH((OCFD_weno5_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 303:
                    CUDA_LAUNCH((OCFD_weno7_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 304:   // weno7_symbo
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 305:   // weno7_symbo_limiter
                    CUDA_LAUNCH(( OCFD_weno7_SYMBO_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 306:
                    CUDA_LAUNCH((OCFD_NND2_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 307:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 308:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 309:
                    CUDA_LAUNCH((OCFD_OMP6_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 310:
                    if(HybridAuto.Style == 1){
                        CUDA_LAUNCH((OCFD_HybridAuto_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
                    }else if(HybridAuto.Style == 2){
                        CUDA_LAUNCH((OCFD_HybridAuto_P_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
                    }
                    break;
            }
        }
        return;
    }

    // Characteristic path: a single launch, available for 304, 305 and 310 only.
    switch(Scheme_invis_ID){
        case 301: case 302: case 303:
        case 306: case 307: case 308: case 309:
            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
            break;
        case 304:   // weno7_symbo
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 305:   // weno7_symbo_limiter
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 310:
            if(HybridAuto.Style == 1){
                CUDA_LAUNCH((OCFD_HybridAuto_character_P_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
            }else if(HybridAuto.Style == 2){
                CUDA_LAUNCH((OCFD_HybridAuto_character_P_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
            }
            break;
    }
}
/*
 * x-direction dispatcher for the "_M_" kernel family.  Picks the kernel from
 * Scheme_invis_ID (301..310) and enqueues it on *stream.
 *
 *  - IF_CHARTERIC == 1: one launch of a "character" kernel.  Only schemes
 *    304, 305 and 310 have one; 301-303 and 306-309 print a message on
 *    rank 0 (my_id == 0) and launch nothing.
 *  - otherwise: five launches, with i = 0..4 passed as the first kernel
 *    argument.
 *  Scheme 310 further selects on HybridAuto.Style (1 or 2); other Style
 *  values launch nothing.  Scheme IDs outside 301..310 launch nothing.
 *
 * The grid is sized with a block x-size of blockdim_in.x-1; afterwards the
 * block gets one extra thread in x and the job ends one index later in x.
 * No dynamic shared memory is requested (third launch parameter is 0).
 */
void OCFD_dx2(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc,
    cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    // field with LAPs
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x-1, blockdim_in.y, blockdim_in.z, extent.x, extent.y, extent.z );

    dim3 flagxyzb(4, 0, Non_ref[1]);
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);

    job_in.end.x += 1;
    blockdim.x += 1;

    if(IF_CHARTERIC != 1){
        // Component-wise path: same kernel launched once per i.
        for(int i=0; i<5; i++){
            switch(Scheme_invis_ID){
                case 301:
                    CUDA_LAUNCH((OCFD_UP7_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 302:
                    CUDA_LAUNCH((OCFD_weno5_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 303:
                    CUDA_LAUNCH((OCFD_weno7_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 304:   // weno7_symbo
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 305:   // weno7_symbo_limiter
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 306:
                    CUDA_LAUNCH((OCFD_NND2_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 307:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 308:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 309:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 310:
                    if(HybridAuto.Style == 1){
                        CUDA_LAUNCH((OCFD_HybridAuto_M_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
                    }else if(HybridAuto.Style == 2){
                        CUDA_LAUNCH((OCFD_HybridAuto_M_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_x, job_in) ));
                    }
                    break;
            }
        }
        return;
    }

    // Characteristic path: a single launch, available for 304, 305 and 310 only.
    switch(Scheme_invis_ID){
        case 301: case 302: case 303:
        case 306: case 307: case 308: case 309:
            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
            break;
        case 304:   // weno7_symbo
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 0, *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 305:   // weno7_symbo_limiter
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 0, *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 310:
            if(HybridAuto.Style == 1){
                CUDA_LAUNCH((OCFD_HybridAuto_character_M_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
            }else if(HybridAuto.Style == 2){
                CUDA_LAUNCH((OCFD_HybridAuto_character_M_Jameson_kernel<<<griddim, blockdim, 0, *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_x, job_in) ));
            }
            break;
    }
}
/*
 * y-direction dispatcher for the "_M_" kernel family.  Picks the kernel from
 * Scheme_invis_ID (301..310) and enqueues it on *stream.
 *
 *  - IF_CHARTERIC == 1: one launch of a "character" kernel.  Only schemes
 *    304, 305 and 310 have one; 301-303 and 306-309 print a message on
 *    rank 0 (my_id == 0) and launch nothing.
 *  - otherwise: five launches, with i = 0..4 passed as the first kernel
 *    argument.
 *  Scheme 310 further selects on HybridAuto.Style (1 or 2); other Style
 *  values launch nothing.  Scheme IDs outside 301..310 launch nothing.
 *
 * blockdim_in is not used: the grid is sized for a fixed 8x7x4 block request,
 * then the block gets one extra thread in y and the job ends one index later
 * in y.  Every launch requests 16*8*4*sizeof(REAL) bytes of dynamic shared
 * memory.
 */
void OCFD_dy2(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc,
    cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in , &extent);

    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, extent.x, extent.y, extent.z);

    dim3 flagxyzb(5, 0, Non_ref[3]);
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);

    job_in.end.y += 1;
    blockdim.y += 1;

    if(IF_CHARTERIC != 1){
        // Component-wise path: same kernel launched once per i.
        for(int i=0; i<5; i++){
            switch(Scheme_invis_ID){
                case 301:
                    CUDA_LAUNCH((OCFD_UP7_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 302:
                    CUDA_LAUNCH((OCFD_weno5_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 303:
                    CUDA_LAUNCH((OCFD_weno7_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 304:   // weno7_symbo
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 305:   // weno7_symbo_limiter
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 306:
                    CUDA_LAUNCH((OCFD_NND2_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 307:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 308:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 309:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 310:
                    if(HybridAuto.Style == 1){
                        CUDA_LAUNCH((OCFD_HybridAuto_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
                    }else if(HybridAuto.Style == 2){
                        CUDA_LAUNCH((OCFD_HybridAuto_M_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_y, job_in) ));
                    }
                    break;
            }
        }
        return;
    }

    // Characteristic path: a single launch, available for 304, 305 and 310 only.
    switch(Scheme_invis_ID){
        case 301: case 302: case 303:
        case 306: case 307: case 308: case 309:
            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
            break;
        case 304:   // weno7_symbo
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 305:   // weno7_symbo_limiter
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 310:
            if(HybridAuto.Style == 1){
                CUDA_LAUNCH((OCFD_HybridAuto_character_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
            }else if(HybridAuto.Style == 2){
                CUDA_LAUNCH((OCFD_HybridAuto_character_M_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_y, job_in) ));
            }
            break;
    }
}
/*
 * z-direction dispatcher for the "_M_" kernel family.  Picks the kernel from
 * Scheme_invis_ID (301..310) and enqueues it on *stream.
 *
 *  - IF_CHARTERIC == 1: one launch of a "character" kernel.  Only schemes
 *    304, 305 and 310 have one; 301-303 and 306-309 print a message on
 *    rank 0 (my_id == 0) and launch nothing.
 *  - otherwise: five launches, with i = 0..4 passed as the first kernel
 *    argument.
 *  Scheme 310 further selects on HybridAuto.Style (1 or 2); other Style
 *  values launch nothing.  Scheme IDs outside 301..310 launch nothing.
 *
 * blockdim_in is not used: the grid is sized for a fixed 8x7x4 block request
 * with the job extents passed in (x, z, y) order, then the block gets one
 * extra thread in its y dimension and the job ends one index later in z.
 * Every launch requests 16*8*4*sizeof(REAL) bytes of dynamic shared memory.
 */
void OCFD_dz2(cudaSoA pf, cudaSoA pdu, cudaField Ajac, cudaField u, cudaField v, cudaField w, cudaField cc,
    cudaField Ax, cudaField Ay, cudaField Az, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int boundl, int boundr){
    dim3 extent;
    jobsize(&job_in , &extent);

    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim, &blockdim, 8, 7, 4, extent.x, extent.z, extent.y);

    dim3 flagxyzb(6, 0, Non_ref[5]);
    OCFD_bound(&flagxyzb, boundl, boundr, job_in);

    job_in.end.z += 1;
    blockdim.y += 1;

    if(IF_CHARTERIC != 1){
        // Component-wise path: same kernel launched once per i.
        for(int i=0; i<5; i++){
            switch(Scheme_invis_ID){
                case 301:
                    CUDA_LAUNCH((OCFD_UP7_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 302:
                    CUDA_LAUNCH((OCFD_weno5_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 303:
                    CUDA_LAUNCH((OCFD_weno7_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 304:   // weno7_symbo
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 305:   // weno7_symbo_limiter
                    CUDA_LAUNCH((OCFD_weno7_SYMBO_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 306:
                    CUDA_LAUNCH((OCFD_NND2_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 307:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 0, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 308:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 1, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 309:
                    CUDA_LAUNCH((OCFD_OMP6_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, 2, flagxyzb, pf, pdu, Ajac, job_in) ));
                    break;
                case 310:
                    if(HybridAuto.Style == 1){
                        CUDA_LAUNCH((OCFD_HybridAuto_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
                    }else if(HybridAuto.Style == 2){
                        CUDA_LAUNCH((OCFD_HybridAuto_M_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(i, flagxyzb, pf, pdu, Ajac, *HybridAuto.scheme_z, job_in) ));
                    }
                    break;
            }
        }
        return;
    }

    // Characteristic path: a single launch, available for 304, 305 and 310 only.
    switch(Scheme_invis_ID){
        case 301: case 302: case 303:
        case 306: case 307: case 308: case 309:
            if(my_id == 0) printf("This scheme does not support charteric flux reconstruction\n");
            break;
        case 304:   // weno7_symbo
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(0, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 305:   // weno7_symbo_limiter
            CUDA_LAUNCH((OCFD_weno7_SYMBO_character_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(1, flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, job_in) ));
            break;
        case 310:
            if(HybridAuto.Style == 1){
                CUDA_LAUNCH((OCFD_HybridAuto_character_M_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
            }else if(HybridAuto.Style == 2){
                CUDA_LAUNCH((OCFD_HybridAuto_character_M_Jameson_kernel<<<griddim, blockdim, 16*8*4*sizeof(REAL), *stream>>>(flagxyzb, pf, pdu, Ajac, u, v, w, cc, Ax, Ay, Az, *HybridAuto.scheme_z, job_in) ));
            }
            break;
    }
}
/*
 * Enqueues OCFD_dx0_CD6_kernel on *stream for the range in job_in, then,
 * when bound is non-zero, calls OCFD_Dx0_bound with the same fields and job.
 *
 *   pf, pfx      input field and output field
 *   job_in       index range (field with LAPs); its extents size the grid
 *   blockdim_in  requested block shape, passed through in (x, y, z) order
 *   bound        0 skips the OCFD_Dx0_bound call
 */
void OCFD_dx0_jac(cudaField pf, cudaField pfx, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int bound){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.x, blockdim_in.y, blockdim_in.z, extent.x, extent.y, extent.z);

    CUDA_LAUNCH(( OCFD_dx0_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(pf, pfx, job_in) ));

    if(bound == 0) return;
    OCFD_Dx0_bound(pf, pfx, job_in, blockdim_in, stream);
}
/*
 * Enqueues OCFD_dy0_CD6_kernel on *stream for the range in job_in, then,
 * when bound is non-zero, calls OCFD_Dy0_bound with the same fields and job.
 *
 *   pf, pfy      input field and output field
 *   job_in       index range; its extents size the grid
 *   blockdim_in  requested block shape; handed to cal_grid_block_dim in
 *                (y, x, z) order, together with the extents in (y, x, z) order
 *   bound        0 skips the OCFD_Dy0_bound call
 */
void OCFD_dy0_jac(cudaField pf, cudaField pfy, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int bound){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.y, blockdim_in.x, blockdim_in.z, extent.y, extent.x, extent.z );

    CUDA_LAUNCH(( OCFD_dy0_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(pf, pfy, job_in) ));

    if(bound == 0) return;
    OCFD_Dy0_bound(pf, pfy, job_in, blockdim_in, stream);
}
/*
 * Enqueues OCFD_dz0_CD6_kernel on *stream for the range in job_in, then,
 * when bound is non-zero, calls OCFD_Dz0_bound with the same fields and job.
 *
 *   pf, pfz      input field and output field
 *   job_in       index range; its extents size the grid
 *   blockdim_in  requested block shape; handed to cal_grid_block_dim in
 *                (z, x, y) order, together with the extents in (z, x, y) order
 *   bound        0 skips the OCFD_Dz0_bound call
 */
void OCFD_dz0_jac(cudaField pf, cudaField pfz, cudaJobPackage job_in, dim3 blockdim_in, cudaStream_t *stream, int bound){
    dim3 extent;
    jobsize(&job_in, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, blockdim_in.z, blockdim_in.x, blockdim_in.y, extent.z, extent.x, extent.y );

    CUDA_LAUNCH(( OCFD_dz0_CD6_kernel<<<griddim, blockdim, 0, *stream>>>(pf, pfz, job_in) ));

    if(bound == 0) return;
    OCFD_Dz0_bound(pf, pfz, job_in, blockdim_in, stream);
}
#ifdef __cplusplus
}
#endif
#include <math.h>
#include "parameters.h"
#include "cuda_commen.h"
#include "commen_kernel.h"
#include "parameters_d.h"
#include "OCFD_warp_shuffle.h"
#include "cuda_utility.h"
#include "OCFD_Schemes_hybrid_auto.h"
#include "OCFD_Schemes_Choose.h"
#include "OCFD_bound_Scheme.h"
#include "OCFD_Schemes.h"
#include "OCFD_mpi_dev.h"
#include "OCFD_mpi.h"
#include "OCFD_IO_mpi.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Runs the hybrid-scheme setup sequence on the given stream.
 *
 * Comput_P is always called first (pd_d, pT_d -> pP_d).  What follows depends
 * on HybridAuto.Style:
 *   2     -> Comput_Scheme_point_Jameson only
 *   1     -> Comput_grad, modify_NT, optional Smoothing_dp (when
 *            HybridAuto.IF_Smooth_dp == 1), Patch_zones, Boundary_dp,
 *            Comput_Scheme_point, in that order
 *   other -> nothing further
 */
void Set_Scheme_HybridAuto(cudaStream_t *stream){
    Comput_P(pd_d, pT_d, pP_d, stream);

    if(HybridAuto.Style == 1){
        Comput_grad(pP_d, stream);
        modify_NT(stream);
        if(HybridAuto.IF_Smooth_dp == 1){
            Smoothing_dp(stream);
        }
        Patch_zones(stream);
        Boundary_dp(stream);
        Comput_Scheme_point(stream);
        return;
    }

    if(HybridAuto.Style == 2){
        Comput_Scheme_point_Jameson(stream);
    }
}
//---------------------------------------------------Comput_P--------------------------------------------------------
/*
 * Point-wise product P = p00 * d * T, read and written through get_Field_LAP.
 *
 * One thread per (x, y, z) with the plain x/y/z block-grid mapping.  Only
 * job.end is consulted: indices start at 0 and job.start is not added.
 */
__global__ void Comput_P_kernel(cudaField d, cudaField T, cudaField P, REAL p00, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;

    // Threads past the end of the job have nothing to do.
    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    const REAL d_val = get_Field_LAP(d, x, y, z);
    const REAL T_val = get_Field_LAP(T, x, y, z);
    get_Field_LAP(P, x, y, z) = p00 * d_val * T_val;
}
/*
 * Host launcher for Comput_P_kernel.
 *
 * The job spans (0,0,0) to (nx+2*LAP, ny+2*LAP, nz+2*LAP); the block request
 * is BlockDimX x BlockDimY x BlockDimZ.  The kernel receives
 * p00 = 1/(Gamma*Ama*Ama) and is enqueued on *stream with no dynamic shared
 * memory.
 */
void Comput_P(cudaField *d, cudaField *T, cudaField *P, cudaStream_t *stream){
    cudaJobPackage job(dim3(0, 0, 0) , dim3(nx+2*LAP, ny+2*LAP, nz+2*LAP));

    dim3 extent;
    jobsize(&job, &extent);

    dim3 griddim, blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);

    const REAL p00 = 1.0/(Gamma*Ama*Ama);
    CUDA_LAUNCH((Comput_P_kernel<<<griddim, blockdim, 0, *stream>>>(*d, *T, *P, p00, job)));
}
//--------------------------------------------------------------------------------------------------------------------
//----------------------------------------------------Comput_grad_P----------------------------------------------------
// Butterfly (xor) sum across the lanes of a warp: partners at lane distance
// 32, 16, 8, 4, 2, 1 are combined in turn, so every lane ends up holding the total.
// NOTE(review): the first distance of 32 presumes a 64-lane warp/wavefront — confirm
// before running on hardware with 32-lane warps.
__device__ REAL warpReduce(REAL mySum){
    for(int lane_mask = 32; lane_mask >= 1; lane_mask >>= 1){
        mySum += __shfl_xor_double(mySum, lane_mask, warpSize);
    }
    return mySum;
}
// Computes the magnitude of the physical-space gradient from the three
// computational-space derivatives (pk, pi, ps) and the metric fields (A**),
// stores it in grad_f, and reduces the magnitudes of the block to one partial sum.
//   SMEMDIM : number of per-warp partial sums kept in dynamic shared memory
//   g_odata : one partial sum per block, indexed by the linear block id
// Derivative fields and grad_f use unpadded indices; metric fields are read at +LAP.
// Every thread of the block must reach the shuffles and the barrier, so the
// bounds test only guards the field accesses.
__global__ void Comput_grad1_kernel(cudaField pk, cudaField pi, cudaField ps, cudaField Akx, cudaField Aky,
    cudaField Akz, cudaField Aix, cudaField Aiy, cudaField Aiz, cudaField Asx, cudaField Asy, cudaField Asz,
    int SMEMDIM, cudaField grad_f, REAL *g_odata, cudaJobPackage job){
    extern __shared__ REAL shared[];

    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;

    // Linear thread id inside the block and its warp / lane decomposition.
    const unsigned int tid  = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const unsigned int warp = tid / warpSize;
    const unsigned int lane = tid % warpSize;

    REAL partial = 0.;
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        const REAL fk = get_Field(pk, x, y, z);
        const REAL fi = get_Field(pi, x, y, z);
        const REAL fs = get_Field(ps, x, y, z);

        const REAL px = fk * get_Field_LAP(Akx, x+LAP, y+LAP, z+LAP)
                      + fi * get_Field_LAP(Aix, x+LAP, y+LAP, z+LAP)
                      + fs * get_Field_LAP(Asx, x+LAP, y+LAP, z+LAP);
        const REAL py = fk * get_Field_LAP(Aky, x+LAP, y+LAP, z+LAP)
                      + fi * get_Field_LAP(Aiy, x+LAP, y+LAP, z+LAP)
                      + fs * get_Field_LAP(Asy, x+LAP, y+LAP, z+LAP);
        const REAL pz = fk * get_Field_LAP(Akz, x+LAP, y+LAP, z+LAP)
                      + fi * get_Field_LAP(Aiz, x+LAP, y+LAP, z+LAP)
                      + fs * get_Field_LAP(Asz, x+LAP, y+LAP, z+LAP);

        partial = sqrt(px*px + py*py + pz*pz);
        get_Field(grad_f, x, y, z) = partial;
    }

    // Stage 1: sum inside each warp, lane 0 publishes the warp total.
    partial = warpReduce(partial);
    if(lane == 0) shared[warp] = partial;
    __syncthreads();

    // Stage 2: the first warp sums the per-warp totals.
    partial = (tid < SMEMDIM) ? shared[tid] : 0;
    if(warp == 0) partial = warpReduce(partial);

    if(tid == 0){
        g_odata[blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z] = partial;
    }
}
// One pass of an in-place sum reduction over g_odata[0 .. g_odata_size).
// Each block sums the (up to blockDim.x) entries it covers and writes the result to
// g_odata[blockIdx.x]; entries at index >= gridDim.x are cleared so that the next
// pass, launched with the same g_odata_size, only sees the new partial sums.
//
// Requirements visible in the code:
//   * 1-D launch; dynamic shared memory must hold 8 REAL values (one per warp),
//     i.e. the kernel is written for blocks made of 8 warps.
//
// Fix: the clearing store is now limited to x < g_odata_size. Previously every
// thread of the grid tail (x >= g_odata_size) wrote past the end of the buffer.
//
// NOTE(review): with more than one block the pass is still racy — block b stores
// g_odata[b] while block 0 may not yet have loaded that entry. A race-free version
// needs a separate output buffer, which would change the kernel's interface.
__global__ void add_kernel(REAL *g_odata, int g_odata_size){
    extern __shared__ REAL shared[];
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int warpId = threadIdx.x / warpSize;
    unsigned int laneIdx = threadIdx.x % warpSize;
    const bool in_range = (x < g_odata_size);

    REAL grad_f0 = 0.;
    if(in_range) grad_f0 = g_odata[x];

    // Stage 1: per-warp sums, published by lane 0 of each warp.
    grad_f0 = warpReduce(grad_f0);
    if(laneIdx == 0) shared[warpId] = grad_f0;
    __syncthreads();

    // Stage 2: the first warp adds the 8 per-warp sums.
    grad_f0 = (threadIdx.x < 8)?shared[laneIdx]:0;
    if(warpId == 0) grad_f0 = warpReduce(grad_f0);

    // Clear consumed entries, but never touch memory beyond the buffer.
    if(x >= gridDim.x && in_range) g_odata[x] = 0.0;
    if(threadIdx.x == 0) g_odata[blockIdx.x] = grad_f0;
}
// Scales grad_f in place by grad_f_av1 at every point with index below job.end
// (unpadded indexing, one thread per point).
__global__ void Comput_grad2_kernel(cudaField grad_f, REAL grad_f_av1, cudaJobPackage job){
    const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field(grad_f, i, j, k) = get_Field(grad_f, i, j, k)*grad_f_av1;
}
// Builds the normalised pressure-gradient magnitude in the global field grad_P.
//
// Steps:
//   1. The three computational-space derivatives of *P are written into the first
//      three nx*ny*nz slabs of pdu_d (used as scratch); grad_P aliases the fourth slab.
//   2. Comput_grad1_kernel stores |grad P| in grad_P and one partial sum per block.
//   3. add_kernel passes collapse the partial sums to a single value, which is
//      summed over all ranks and divided by the global point count (the mean).
//   4. Comput_grad2_kernel divides grad_P by that mean.
//
// Fixes relative to the previous version:
//   * The third launch parameter is a size in BYTES. It was passed as an element
//     count (SMEMDIM, 8), so the kernels indexed shared memory out of bounds.
//   * SMEMDIM is rounded up, so a block with fewer than 64 threads still gets a slot.
//   * The host-side sum was malloc'd and never freed; it is now a local variable.
//   * The global point count is formed in REAL to avoid int overflow on large grids.
//
// NOTE(review): the blocking cudaMemcpy below is not issued on *stream. If *stream
// is a non-blocking stream this does not order the copy after the reduction —
// confirm how the stream is created (or whether CUDA_LAUNCH synchronises).
void Comput_grad(cudaField *P, cudaStream_t *stream){
    cudaField Pk_d, Pi_d, Ps_d;
    Pk_d.pitch = pdu_d->pitch; Pk_d.ptr = pdu_d->ptr;
    Pi_d.pitch = pdu_d->pitch; Pi_d.ptr = pdu_d->ptr + pdu_d->pitch*ny*nz;
    Ps_d.pitch = pdu_d->pitch; Ps_d.ptr = pdu_d->ptr + 2 * pdu_d->pitch*ny*nz;
    grad_P.pitch = pdu_d->pitch; grad_P.ptr = pdu_d->ptr + 3 * pdu_d->pitch*ny*nz;

    // Derivatives are evaluated on the interior of the padded field.
    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
    OCFD_dx0(*P, Pk_d, job, BlockDim_X, stream, D0_bound[0], D0_bound[1]);
    OCFD_dy0(*P, Pi_d, job, BlockDim_Y, stream, D0_bound[2], D0_bound[3]);
    OCFD_dz0(*P, Ps_d, job, BlockDim_Z, stream, D0_bound[4], D0_bound[5]);

    // From here on the job is expressed in unpadded indices.
    dim3 size, griddim, blockdim;
    job.setup(dim3(0, 0, 0), dim3(nx, ny, nz));
    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    REAL *g_odata;
    REAL Sum = 0.0;
    unsigned int g_odata_size = griddim.x*griddim.y*griddim.z;
    CUDA_LAUNCH(( cudaMalloc((REAL **)&g_odata, g_odata_size*sizeof(REAL)) ));

    // One shared-memory slot per warp of the block. //Warpsize is 64
    const unsigned int threads_per_block = blockdim.x*blockdim.y*blockdim.z;
    int SMEMDIM = (threads_per_block + 63)/64;
    CUDA_LAUNCH((Comput_grad1_kernel<<<griddim, blockdim, SMEMDIM*sizeof(REAL), *stream>>>(Pk_d, Pi_d, Ps_d, *pAkx_d, *pAky_d,
    *pAkz_d, *pAix_d, *pAiy_d, *pAiz_d, *pAsx_d, *pAsy_d, *pAsz_d, SMEMDIM, grad_P, g_odata, job)));

    // Repeated passes until a single block produces the final sum in g_odata[0].
    // add_kernel keeps 8 per-warp partial sums in shared memory.
    dim3 blockdim_sum(512);
    dim3 griddim_sum(g_odata_size);
    do{
        griddim_sum.x = (griddim_sum.x + blockdim_sum.x - 1)/blockdim_sum.x;
        CUDA_LAUNCH(( add_kernel<<<griddim_sum, blockdim_sum, 8*sizeof(REAL), *stream>>>(g_odata, g_odata_size) ));
    } while(griddim_sum.x > 1);

    CUDA_LAUNCH(( cudaMemcpy(&Sum, g_odata, sizeof(REAL), cudaMemcpyDeviceToHost) ));
    CUDA_LAUNCH(( cudaFree(g_odata) ));

    REAL grad_f_av, grad_f_av1;
    MPI_Allreduce(&Sum, &grad_f_av, 1, OCFD_DATA_TYPE, MPI_SUM, MPI_COMM_WORLD);
    // Promote before multiplying: the product of three ints can exceed INT_MAX.
    grad_f_av = grad_f_av/((REAL)NX_GLOBAL * NY_GLOBAL * NZ_GLOBAL);
    grad_f_av1 = 1.0/grad_f_av;

    CUDA_LAUNCH((Comput_grad2_kernel<<<griddim, blockdim, 0, *stream>>>(grad_P, grad_f_av1, job)));
}
//----------------------------------------------------------------------------------------------------------
//---------------------------------------------Modify Negative T--------------------------------------------
// Repairs points with negative temperature.
//   T       : LAP-padded temperature field, indexed at (x, y, z) in padded coordinates
//   grad_f  : gradient indicator, an UNPADDED field (indexed at x-LAP, y-LAP, z-LAP,
//             as in the other kernels that combine padded and unpadded fields)
//   P_intvs : threshold; a repaired point gets an indicator of at least P_intvs + 1
//   job     : padded index range [job.start, job.end) to process
// At a point with T < 0, T is replaced by the mean of its six face neighbours and
// the indicator is raised to max(10 * indicator, P_intvs + 1).
//
// Fix: the indicator was written with get_Field and read with get_Field_LAP, both
// at the padded coordinates (x, y, z). That reads and writes two different, shifted
// locations of an unpadded array and can run past its end. Both accesses now use
// the unpadded accessor with the LAP offset removed.
//
// NOTE(review): neighbour values are read while other threads may be repairing
// them in the same launch; the result then depends on execution order.
__global__ void ana_NT_kernel(cudaField T, cudaField grad_f, REAL P_intvs, cudaJobPackage job){
    // field with LAP
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        REAL t;
        t = get_Field_LAP(T , x,y,z);
        if(t < 0){
            t = get_Field_LAP(T , x-1 , y , z) + get_Field_LAP(T , x+1 , y , z)
               +get_Field_LAP(T , x , y-1 , z) + get_Field_LAP(T , x , y+1 , z)
               +get_Field_LAP(T , x , y , z-1) + get_Field_LAP(T , x , y , z+1);
            get_Field_LAP(T , x,y,z) = t/6.0;
            // Same unpadded location for the read and the write.
            get_Field(grad_f, x-LAP, y-LAP, z-LAP) = fmax(10.*get_Field(grad_f, x-LAP, y-LAP, z-LAP), P_intvs + 1.);
        }
    }
}
// Launches ana_NT_kernel over the interior points [LAP, n?_lap) of the padded block,
// using HybridAuto.P_intvs[1] as the threshold for repaired points.
// The launch now goes through CUDA_LAUNCH like every other launch in this file,
// so a failed launch is no longer silently ignored.
void modify_NT(cudaStream_t *stream){
    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );
    cudaJobPackage job(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));
    CUDA_LAUNCH(( ana_NT_kernel<<<griddim , blockdim, 0, *stream>>>(*pT_d, grad_P, HybridAuto.P_intvs[1], job) ));
}
//----------------------------------------------Smoothing_dp------------------------------------------------
// Copies the unpadded indicator grad_f into the padded work field f over the padded
// index range [job.start, job.end). Values at or above P_intvs are tripled on the way.
__global__ void Modify_P_kernel(cudaField f, cudaField grad_f, REAL P_intvs, cudaJobPackage job){
    const unsigned int i = job.start.x + (threadIdx.x + blockIdx.x * blockDim.x);
    const unsigned int j = job.start.y + (threadIdx.y + blockIdx.y * blockDim.y);
    const unsigned int k = job.start.z + (threadIdx.z + blockIdx.z * blockDim.z);

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    const REAL g = get_Field(grad_f, i-LAP, j-LAP, k-LAP);
    if(g >= P_intvs){
        get_Field_LAP(f, i, j, k) = 3*g;
    }else{
        get_Field_LAP(f, i, j, k) = g;
    }
}
// Seven-point smoothing of the padded field f into the unpadded field grad_f:
// the centre value has weight 1/3 and each of the six face neighbours weight 1/9.
// job is given in padded indices; the result is stored at (x-LAP, y-LAP, z-LAP).
__global__ void Modify_grad_inner_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    const REAL neighbours = get_Field_LAP(f, i+1, j, k) + get_Field_LAP(f, i-1, j, k)
                          + get_Field_LAP(f, i, j+1, k) + get_Field_LAP(f, i, j-1, k)
                          + get_Field_LAP(f, i, j, k+1) + get_Field_LAP(f, i, j, k-1);

    get_Field(grad_f, i-LAP, j-LAP, k-LAP) = get_Field_LAP(f, i, j, k)/3.0 + neighbours/9.0;
}
// Plain copy from the padded field f to the unpadded field grad_f.
// job is given in unpadded indices; the source is read at (+LAP, +LAP, +LAP).
__global__ void Modify_grad_outer_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field(grad_f, i, j, k) = get_Field_LAP(f, i+LAP, j+LAP, k+LAP);
}
// Helper for Smoothing_dp: runs Modify_grad_outer_kernel on one boundary face.
//   lo, hi           : unpadded index range [lo, hi) of the face
//   cnt_x/cnt_y/cnt_z: point counts used to size the launch
static void Smoothing_dp_copy_face_(dim3 lo, dim3 hi, int cnt_x, int cnt_y, int cnt_z, cudaStream_t *stream){
    dim3 face_grid, face_block;
    cudaJobPackage job_outer(lo, hi);
    cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, cnt_x, cnt_y, cnt_z);
    CUDA_LAUNCH(( Modify_grad_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, grad_P, job_outer) ))
}

// Smooths the gradient indicator grad_P.
//   1. Modify_P_kernel copies grad_P into the padded work field pPP_d (values at or
//      above the threshold of the last stage are tripled).
//   2. The padding of pPP_d is exchanged with the neighbouring ranks.
//   3. Modify_grad_inner_kernel writes the seven-point average back into grad_P.
//   4. On every non-periodic face owned by this rank the outermost layer of grad_P
//      is overwritten with the corresponding pPP_d values.
void Smoothing_dp(cudaStream_t *stream){
    const REAL threshold = HybridAuto.P_intvs[HybridA_Stage - 1];

    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
    dim3 extent, grid, block;
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);

    CUDA_LAUNCH(( Modify_P_kernel<<<grid, block, 0, *stream>>>(*pPP_d, grad_P, threshold, job) ));
    exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream);
    CUDA_LAUNCH(( Modify_grad_inner_kernel<<<grid, block, 0, *stream>>>(*pPP_d, grad_P, job) ))

    //------------------------------------- x faces -------------------------------------
    if (npx == 0 && Iperiodic[0] == 0)
    {
        Smoothing_dp_copy_face_(dim3(0, 0, 0), dim3(1, ny, nz), 1, ny, nz, stream);
    }
    if (npx == NPX0 - 1 && Iperiodic[0] == 0)
    {
        Smoothing_dp_copy_face_(dim3(nx-1, 0, 0), dim3(nx, ny, nz), 1, ny, nz, stream);
    }
    //------------------------------------- y faces -------------------------------------
    if (npy == 0 && Iperiodic[1] == 0)
    {
        Smoothing_dp_copy_face_(dim3(0, 0, 0), dim3(nx, 1, nz), nx, 1, nz, stream);
    }
    if (npy == NPY0 - 1 && Iperiodic[1] == 0)
    {
        Smoothing_dp_copy_face_(dim3(0, ny-1, 0), dim3(nx, ny, nz), nx, 1, nz, stream);
    }
    //------------------------------------- z faces -------------------------------------
    if (npz == 0 && Iperiodic[2] == 0)
    {
        Smoothing_dp_copy_face_(dim3(0, 0, 0), dim3(nx, ny, 1), nx, ny, 1, stream);
    }
    if (npz == NPZ0 - 1 && Iperiodic[2] == 0)
    {
        Smoothing_dp_copy_face_(dim3(0, 0, nz-1), dim3(nx, ny, nz), nx, ny, 1, stream);
    }
}
//-------------------------------------------------------------------------------------------------------------------
//----------------------------------------------------Patch_zones----------------------------------------------------
// Sets grad_f to the constant Pa_zones at every point of the unpadded index
// range [job.start, job.end).
__global__ void Patch_zones_kernel(cudaField grad_f, REAL Pa_zones, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field(grad_f, i, j, k) = Pa_zones;
}
// Overwrites grad_P with a fixed value inside each user-defined patch zone.
// HybridAuto.zones is read as rows of six ints (begin/end index per direction).
// get_i_node/get_j_node/get_k_node convert each bound into a rank index plus a local
// index; the local range is then clipped to this rank's block:
//   * a bound owned by a lower rank starts the range at 0,
//   * a bound owned by a higher rank ends it at the block size,
//   * if the zone lies entirely on other ranks nothing is launched.
void Patch_zones(cudaStream_t *stream){
    int node_ib, node_ie, node_jb, node_je, node_kb, node_ke;
    int ib, ie, jb, je, kb, ke;
    int (*zone_table)[6] = (int(*)[6])HybridAuto.zones;

    for(int zone = 0; zone < HybridAuto.Num_Patch_zones; zone++){
        get_i_node(zone_table[zone][0], &node_ib, &ib);
        get_i_node(zone_table[zone][1], &node_ie, &ie);
        get_j_node(zone_table[zone][2], &node_jb, &jb);
        get_j_node(zone_table[zone][3], &node_je, &je);
        get_k_node(zone_table[zone][4], &node_kb, &kb);
        get_k_node(zone_table[zone][5], &node_ke, &ke);

        // Becomes false as soon as one direction does not overlap this rank.
        bool overlaps = true;

        if(node_ib < npx) ib = 0;
        else if(node_ib > npx) overlaps = false;
        if(node_ie > npx) ie = nx;
        else if(node_ie < npx) overlaps = false;

        if(node_jb < npy) jb = 0;
        else if(node_jb > npy) overlaps = false;
        if(node_je > npy) je = ny;
        else if(node_je < npy) overlaps = false;

        if(node_kb < npz) kb = 0;
        else if(node_kb > npz) overlaps = false;
        if(node_ke > npz) ke = nz;
        else if(node_ke < npz) overlaps = false;

        if(!overlaps) continue;

        cudaJobPackage job(dim3(ib, jb, kb) , dim3(ie, je, ke));
        dim3 extent, grid, block;
        jobsize(&job, &extent);
        cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);

        const REAL Pa_zones = HybridAuto.Pa_zones[zone];
        CUDA_LAUNCH(( Patch_zones_kernel<<<grid, block, 0, *stream>>>(grad_P, Pa_zones, job) ));
    }
}
//---------------------------------------------------------------------------------------------------------------
//----------------------------------------------Boundary_dp-----------------------------------------------------
// Copies the unpadded field grad_f into the padded field f.
// Thread indices are relative to job.start; the padded destination is reached
// through a linear offset built from job.start, f.pitch and ny_2lap_d.
__global__ void Modify_P_all_kernel(cudaField f, cudaField grad_f, cudaJobPackage job){
    const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + f.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = i >= (job.end.x - job.start.x)
                      || j >= (job.end.y - job.start.y)
                      || k >= (job.end.z - job.start.z);
    if(outside) return;

    get_Field_LAP(f, i, j, k, offset) = get_Field(grad_f, i, j, k);
}
// For every point of the padded range [job.start, job.end): copies the value of f
// into its neighbour at x-1.
__global__ void Modify_x_P_outer_kernel(cudaField f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field_LAP(f, i-1, j, k) = get_Field_LAP(f, i, j, k);
}
// For every point of the padded range [job.start, job.end): copies the value of f
// into its neighbour at x+1.
__global__ void Modify_x_M_outer_kernel(cudaField f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field_LAP(f, i+1, j, k) = get_Field_LAP(f, i, j, k);
}
// For every point of the padded range [job.start, job.end): copies the value of f
// into its neighbour at y-1.
__global__ void Modify_y_P_outer_kernel(cudaField f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field_LAP(f, i, j-1, k) = get_Field_LAP(f, i, j, k);
}
// For every point of the padded range [job.start, job.end): copies the value of f
// into its neighbour at y+1.
__global__ void Modify_y_M_outer_kernel(cudaField f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field_LAP(f, i, j+1, k) = get_Field_LAP(f, i, j, k);
}
// For every point of the padded range [job.start, job.end): copies the value of f
// into its neighbour at z-1.
__global__ void Modify_z_P_outer_kernel(cudaField f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field_LAP(f, i, j, k-1) = get_Field_LAP(f, i, j, k);
}
// For every point of the padded range [job.start, job.end): copies the value of f
// into its neighbour at z+1.
__global__ void Modify_z_M_outer_kernel(cudaField f, cudaJobPackage job){
    const unsigned int i = job.start.x + threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int j = job.start.y + threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int k = job.start.z + threadIdx.z + blockIdx.z * blockDim.z;

    if(i >= job.end.x || j >= job.end.y || k >= job.end.z) return;

    get_Field_LAP(f, i, j, k+1) = get_Field_LAP(f, i, j, k);
}
// Prepares the padded indicator field pPP_d used by the scheme selection.
//   1. grad_P is copied into the interior of pPP_d.
//   2. The padding is exchanged with the neighbouring ranks.
//   3. On each non-periodic face owned by this rank, the first layer just outside
//      the interior receives a copy of the outermost interior layer.
void Boundary_dp(cudaStream_t *stream){
    // --- interior copy followed by the exchange ---
    cudaJobPackage job(dim3(LAP, LAP, LAP) , dim3(nx+LAP, ny+LAP, nz+LAP));
    dim3 extent, grid, block;
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Modify_P_all_kernel<<<grid, block, 0, *stream>>>(*pPP_d, grad_P, job) ));
    exchange_boundary_xyz_Async_packed_dev(pP, pPP_d, stream);

    // --- x direction ---
    if (Iperiodic[0] == 0)
    {
        if (npx == 0)
        {
            dim3 face_grid, face_block;
            cudaJobPackage face(dim3(LAP, LAP, LAP), dim3(LAP+1, ny+LAP, nz+LAP));
            cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
            CUDA_LAUNCH(( Modify_x_P_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face) ))
        }
        if (npx == NPX0 - 1)
        {
            dim3 face_grid, face_block;
            cudaJobPackage face(dim3(nx+LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
            cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, 1, ny, nz);
            CUDA_LAUNCH(( Modify_x_M_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face) ))
        }
    }

    // --- y direction ---
    if (Iperiodic[1] == 0)
    {
        if (npy == 0)
        {
            dim3 face_grid, face_block;
            cudaJobPackage face(dim3(LAP, LAP, LAP), dim3(nx+LAP, LAP+1, nz+LAP));
            cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
            CUDA_LAUNCH(( Modify_y_P_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face) ))
        }
        if (npy == NPY0 - 1)
        {
            dim3 face_grid, face_block;
            cudaJobPackage face(dim3(LAP, ny+LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
            cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, 1, nz);
            CUDA_LAUNCH(( Modify_y_M_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face) ))
        }
    }

    // --- z direction ---
    if (Iperiodic[2] == 0)
    {
        if (npz == 0)
        {
            dim3 face_grid, face_block;
            cudaJobPackage face(dim3(LAP, LAP, LAP), dim3(nx+LAP, ny+LAP, LAP+1));
            cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
            CUDA_LAUNCH(( Modify_z_P_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face) ))
        }
        if (npz == NPZ0 - 1)
        {
            dim3 face_grid, face_block;
            cudaJobPackage face(dim3(LAP, LAP, nz+LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));
            cal_grid_block_dim(&face_grid, &face_block, BlockDimX, BlockDimY, BlockDimZ, nx, ny, 1);
            CUDA_LAUNCH(( Modify_z_M_outer_kernel<<<face_grid, face_block, 0, *stream>>>(*pPP_d, face) ))
        }
    }
}
//---------------------------------------------------------------------------------------------------------------------
//-----------------------------------------------Comput_Scheme_point---------------------------------------------------
// Scheme selector for the x direction.
// For each point (relative to job.start) the indicator P is averaged with its x+1
// neighbour and classified: 1 if the mean is at most P_intvs1, 2 if it exceeds
// P_intvs1 only, 3 if it also exceeds P_intvs2. The class is stored in `scheme`,
// addressed with scheme.pitch and a row count of ny_d.
__global__ void Comput_Scheme_point_x_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = x >= (job.end.x - job.start.x)
                      || y >= (job.end.y - job.start.y)
                      || z >= (job.end.z - job.start.z);
    if(outside) return;

    const REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x+1, y, z, offset));
    const int kp = 1 + (dp0 > P_intvs1 ? 1 : 0) + (dp0 > P_intvs2 ? 1 : 0);
    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
}
// Scheme selector for the y direction.
// For each point (relative to job.start) the indicator P is averaged with its y+1
// neighbour and classified: 1 if the mean is at most P_intvs1, 2 if it exceeds
// P_intvs1 only, 3 if it also exceeds P_intvs2. The class is stored in `scheme`,
// addressed with scheme.pitch and a row count of ny_d + 1.
__global__ void Comput_Scheme_point_y_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = x >= (job.end.x - job.start.x)
                      || y >= (job.end.y - job.start.y)
                      || z >= (job.end.z - job.start.z);
    if(outside) return;

    const REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y+1, z, offset));
    const int kp = 1 + (dp0 > P_intvs1 ? 1 : 0) + (dp0 > P_intvs2 ? 1 : 0);
    *(scheme.ptr + (x + scheme.pitch *(y + (z)*(ny_d + 1)))) = kp;
}
// Scheme selector for the z direction.
// For each point (relative to job.start) the indicator P is averaged with its z+1
// neighbour and classified: 1 if the mean is at most P_intvs1, 2 if it exceeds
// P_intvs1 only, 3 if it also exceeds P_intvs2. The class is stored in `scheme`,
// addressed with scheme.pitch and a row count of ny_d.
__global__ void Comput_Scheme_point_z_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = x >= (job.end.x - job.start.x)
                      || y >= (job.end.y - job.start.y)
                      || z >= (job.end.z - job.start.z);
    if(outside) return;

    const REAL dp0 = 0.5 * (get_Field_LAP(P, x, y, z, offset) + get_Field_LAP(P, x, y, z+1, offset));
    const int kp = 1 + (dp0 > P_intvs1 ? 1 : 0) + (dp0 > P_intvs2 ? 1 : 0);
    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
}
// Fills the three scheme-selector arrays (HybridAuto.scheme_x/y/z) from the padded
// indicator field pPP_d, using HybridAuto.P_intvs[0] and [1] as class thresholds.
// Each direction starts one layer before the interior (LAP-1) in its own direction,
// so it covers one more point than the block in that direction.
void Comput_Scheme_point(cudaStream_t *stream){
    dim3 extent, grid, block;

    // x direction
    cudaJobPackage job(dim3(LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Comput_Scheme_point_x_kernel<<<grid, block, 0, *stream>>>(
        *pPP_d, *HybridAuto.scheme_x, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    // y direction
    job.setup(dim3(LAP, LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Comput_Scheme_point_y_kernel<<<grid, block, 0, *stream>>>(
        *pPP_d, *HybridAuto.scheme_y, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    // z direction
    job.setup(dim3(LAP, LAP, LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Comput_Scheme_point_z_kernel<<<grid, block, 0, *stream>>>(
        *pPP_d, *HybridAuto.scheme_z, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3
}
// Jameson-type sensor, x-direction selector array.
// At each point (relative to job.start) the sensor is the sum over the three
// directions of |-P(-1) + 2 P(0) - P(+1)| / (P(-1) + 2 P(0) + P(+1)).
// The sensor is classified into 1, 2 or 3 against P_intvs1 / P_intvs2 and stored
// in `scheme` with a row count of ny_d.
__global__ void Comput_Scheme_point_x_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = x >= (job.end.x - job.start.x)
                      || y >= (job.end.y - job.start.y)
                      || z >= (job.end.z - job.start.z);
    if(outside) return;

    // Twice the centre value is shared by all three directional terms.
    const REAL c2 = 2*get_Field_LAP(P, x, y, z, offset);

    const REAL xm = get_Field_LAP(P, x-1, y, z, offset);
    const REAL xp = get_Field_LAP(P, x+1, y, z, offset);
    const REAL ym = get_Field_LAP(P, x, y-1, z, offset);
    const REAL yp = get_Field_LAP(P, x, y+1, z, offset);
    const REAL zm = get_Field_LAP(P, x, y, z-1, offset);
    const REAL zp = get_Field_LAP(P, x, y, z+1, offset);

    REAL dp0 = fabs(-xm + c2 - xp)/(xm + c2 + xp);
    dp0 += fabs(-ym + c2 - yp)/(ym + c2 + yp);
    dp0 += fabs(-zm + c2 - zp)/(zm + c2 + zp);

    const int kp = 1 + (dp0 > P_intvs1 ? 1 : 0) + (dp0 > P_intvs2 ? 1 : 0);
    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
}
// Jameson-type sensor, y-direction selector array.
// At each point (relative to job.start) the sensor is the sum over the three
// directions of |-P(-1) + 2 P(0) - P(+1)| / (P(-1) + 2 P(0) + P(+1)).
// The sensor is classified into 1, 2 or 3 against P_intvs1 / P_intvs2 and stored
// in `scheme` with a row count of ny_d + 1.
__global__ void Comput_Scheme_point_y_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = x >= (job.end.x - job.start.x)
                      || y >= (job.end.y - job.start.y)
                      || z >= (job.end.z - job.start.z);
    if(outside) return;

    // Twice the centre value is shared by all three directional terms.
    const REAL c2 = 2*get_Field_LAP(P, x, y, z, offset);

    const REAL xm = get_Field_LAP(P, x-1, y, z, offset);
    const REAL xp = get_Field_LAP(P, x+1, y, z, offset);
    const REAL ym = get_Field_LAP(P, x, y-1, z, offset);
    const REAL yp = get_Field_LAP(P, x, y+1, z, offset);
    const REAL zm = get_Field_LAP(P, x, y, z-1, offset);
    const REAL zp = get_Field_LAP(P, x, y, z+1, offset);

    REAL dp0 = fabs(-xm + c2 - xp)/(xm + c2 + xp);
    dp0 += fabs(-ym + c2 - yp)/(ym + c2 + yp);
    dp0 += fabs(-zm + c2 - zp)/(zm + c2 + zp);

    const int kp = 1 + (dp0 > P_intvs1 ? 1 : 0) + (dp0 > P_intvs2 ? 1 : 0);
    *(scheme.ptr + (x + scheme.pitch *(y + (z)*(ny_d + 1)))) = kp;
}
// Jameson-type sensor, z-direction selector array.
// At each point (relative to job.start) the sensor is the sum over the three
// directions of |-P(-1) + 2 P(0) - P(+1)| / (P(-1) + 2 P(0) + P(+1)).
// The sensor is classified into 1, 2 or 3 against P_intvs1 / P_intvs2 and stored
// in `scheme` with a row count of ny_d.
__global__ void Comput_Scheme_point_z_Jameson_kernel(cudaField P, cudaField_int scheme, REAL P_intvs1, REAL P_intvs2, cudaJobPackage job){
    const unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
    const unsigned int offset = job.start.x + P.pitch*(job.start.y + ny_2lap_d*job.start.z);

    const bool outside = x >= (job.end.x - job.start.x)
                      || y >= (job.end.y - job.start.y)
                      || z >= (job.end.z - job.start.z);
    if(outside) return;

    // Twice the centre value is shared by all three directional terms.
    const REAL c2 = 2*get_Field_LAP(P, x, y, z, offset);

    const REAL xm = get_Field_LAP(P, x-1, y, z, offset);
    const REAL xp = get_Field_LAP(P, x+1, y, z, offset);
    const REAL ym = get_Field_LAP(P, x, y-1, z, offset);
    const REAL yp = get_Field_LAP(P, x, y+1, z, offset);
    const REAL zm = get_Field_LAP(P, x, y, z-1, offset);
    const REAL zp = get_Field_LAP(P, x, y, z+1, offset);

    REAL dp0 = fabs(-xm + c2 - xp)/(xm + c2 + xp);
    dp0 += fabs(-ym + c2 - yp)/(ym + c2 + yp);
    dp0 += fabs(-zm + c2 - zp)/(zm + c2 + zp);

    const int kp = 1 + (dp0 > P_intvs1 ? 1 : 0) + (dp0 > P_intvs2 ? 1 : 0);
    *(scheme.ptr + (x + scheme.pitch *(y + (z)*ny_d))) = kp;
}
// Fills the three scheme-selector arrays (HybridAuto.scheme_x/y/z) with the
// Jameson-type sensor evaluated on the padded pressure field pP_d, using
// HybridAuto.P_intvs[0] and [1] as class thresholds. Each direction starts one
// layer before the interior (LAP-1) in its own direction.
void Comput_Scheme_point_Jameson(cudaStream_t *stream){
    dim3 extent, grid, block;

    // x direction
    cudaJobPackage job(dim3(LAP-1, LAP, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Comput_Scheme_point_x_Jameson_kernel<<<grid, block, 0, *stream>>>(
        *pP_d, *HybridAuto.scheme_x, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    // y direction
    job.setup(dim3(LAP, LAP-1, LAP), dim3(nx+LAP, ny+LAP, nz+LAP));
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Comput_Scheme_point_y_Jameson_kernel<<<grid, block, 0, *stream>>>(
        *pP_d, *HybridAuto.scheme_y, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3

    // z direction
    job.setup(dim3(LAP, LAP, LAP-1), dim3(nx+LAP, ny+LAP, nz+LAP));
    jobsize(&job, &extent);
    cal_grid_block_dim(&grid, &block, BlockDimX, BlockDimY, BlockDimZ, extent.x, extent.y, extent.z);
    CUDA_LAUNCH(( Comput_Scheme_point_z_Jameson_kernel<<<grid, block, 0, *stream>>>(
        *pP_d, *HybridAuto.scheme_z, HybridAuto.P_intvs[0], HybridAuto.P_intvs[1], job) ));//HybridA_Stage == 3
}
// Helper for HybridAuto_scheme_IO (rank 0 only): opens "Scheme_<dir><Istep>.dat"
// for writing and emits the two header lines. If the file cannot be opened the
// run is aborted with a message instead of calling fprintf on a NULL stream.
static FILE *HybridAuto_open_scheme_file_(char dir){
    char fp_name[120];
    snprintf(fp_name, sizeof(fp_name), "Scheme_%c%08d.dat", dir, Istep);

    FILE *fp = fopen(fp_name, "w");
    if(fp == NULL){
        fprintf(stderr, "HybridAuto_scheme_IO: cannot open %s for writing\n", fp_name);
        MPI_Abort(MPI_COMM_WORLD, 1);   // terminates the job
        return NULL;
    }
    fprintf(fp, "variables=scheme\n");
    fprintf(fp, "zone i=%d, j=%d\n", NX_GLOBAL, NY_GLOBAL);
    return fp;
}

// Dumps one XY plane (k = NZ_GLOBAL/2) of each scheme-selector array to
// Scheme_x/y/z<Istep>.dat.
// The selector arrays and pPP_d are first copied to the host buffers scheme_x/y/z
// and pP. Rank 0 owns the files; write_2d_XY is called by every rank.
//
// Changes: fp starts as NULL so ranks other than 0 no longer pass an indeterminate
// pointer to write_2d_XY; fopen failures are detected; the file name is built with
// a bounded snprintf.
void HybridAuto_scheme_IO(){
    memcpy_All_int(scheme_x, HybridAuto.scheme_x->ptr, HybridAuto.scheme_x->pitch, D2H, nx+1, ny, nz);
    memcpy_All_int(scheme_y, HybridAuto.scheme_y->ptr, HybridAuto.scheme_y->pitch, D2H, nx, ny+1, nz);
    memcpy_All_int(scheme_z, HybridAuto.scheme_z->ptr, HybridAuto.scheme_z->pitch, D2H, nx, ny, nz+1);
    memcpy_All(pP, pPP_d->ptr , pPP_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    FILE *fp = NULL;

    if(my_id == 0) fp = HybridAuto_open_scheme_file_('x');
    write_2d_XY(fp, NZ_GLOBAL/2, nx+1, ny, 0, scheme_x, pP);

    if(my_id == 0){
        fclose(fp);
        fp = HybridAuto_open_scheme_file_('y');
    }
    write_2d_XY(fp, NZ_GLOBAL/2, nx, ny+1, 0, scheme_y, pP);

    if(my_id == 0){
        fclose(fp);
        fp = HybridAuto_open_scheme_file_('z');
    }
    write_2d_XY(fp, NZ_GLOBAL/2, nx, ny, 0, scheme_z, pP);

    if(my_id == 0) fclose(fp);
}
// Helper for HybridAuto_scheme_Proportion: counts how many of the first `count`
// entries of `scheme` equal 1, equal 2, or hold anything else, normalises the counts
// by plane_points, sums them over all ranks onto rank 0, divides by line_points
// and prints the three values on rank 0.
//   dir : direction label inserted into the message ("X", "Y" or "Z")
static void HybridAuto_print_proportion_(const int *scheme, int count, double plane_points,
                                         double line_points, const char *dir){
    double local[3] = {0.0, 0.0, 0.0};
    double total[3] = {0.0, 0.0, 0.0};

    for(int n = 0; n < count; n++){
        const int kind = scheme[n];
        if(kind == 1)      local[0] += 1.0;
        else if(kind == 2) local[1] += 1.0;
        else               local[2] += 1.0;
    }
    for(int t = 0; t < 3; t++) local[t] /= plane_points;

    MPI_Reduce(local, total, 3, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // A literal percent sign must be written as "%%"; a lone '%' followed by the
    // escape character is an invalid conversion specification.
    // NOTE(review): the printed numbers are fractions in [0,1] although they are
    // followed by a percent sign — confirm whether a factor of 100 is intended.
    if(my_id == 0){
        printf("The first type scheme of Hybrid schemes in direction %s is \033[34m%lf%%\033[0m, second is"
               "\033[34m%lf%%\033[0m, third is \033[34m%lf%%\033[0m\n",
               dir, total[0]/line_points, total[1]/line_points, total[2]/line_points);
    }
}

// Reports, for each direction, which share of the selector points uses the first,
// second and third scheme of the hybrid method.
// The selector arrays are copied to the host buffers scheme_x/y/z first. Each
// direction holds one extra point per rank in its own direction, hence the
// normalisation by (N?_GLOBAL + NP?0) along that direction.
//
// Fixes: the format string used "%" directly before an escape sequence (undefined
// behaviour in printf); the three identical counting sections now share a helper.
void HybridAuto_scheme_Proportion(){
    memcpy_All_int(scheme_x, HybridAuto.scheme_x->ptr, HybridAuto.scheme_x->pitch, D2H, nx+1, ny, nz);
    memcpy_All_int(scheme_y, HybridAuto.scheme_y->ptr, HybridAuto.scheme_y->pitch, D2H, nx, ny+1, nz);
    memcpy_All_int(scheme_z, HybridAuto.scheme_z->ptr, HybridAuto.scheme_z->pitch, D2H, nx, ny, nz+1);

    HybridAuto_print_proportion_(scheme_x, (nx + 1) * ny * nz,
                                 (double)NY_GLOBAL * NZ_GLOBAL, (double)(NX_GLOBAL + NPX0), "X");
    HybridAuto_print_proportion_(scheme_y, nx * (ny + 1) * nz,
                                 (double)NX_GLOBAL * NZ_GLOBAL, (double)(NY_GLOBAL + NPY0), "Y");
    HybridAuto_print_proportion_(scheme_z, nx * ny * (nz + 1),
                                 (double)NX_GLOBAL * NY_GLOBAL, (double)(NZ_GLOBAL + NPZ0), "Z");
}
// Looks up the scheme class used by the "plus" flux at a point.
//   flagxyz : 1 or 4 -> x direction, 2 or 5 -> y direction, 3 or 6 -> z direction
//   coords  : point coordinates relative to job.start (padded index space)
// The selector entry one step ahead in the chosen direction is returned; the y
// array uses a row count of ny_d + 1, the others ny_d. Any other flagxyz yields 0.
__device__ int get_Hyscheme_flag_p_kernel(int flagxyz, dim3 coords, cudaField_int scheme, cudaJobPackage job){
    const unsigned int x = job.start.x + coords.x;
    const unsigned int y = job.start.y + coords.y;
    const unsigned int z = job.start.z + coords.z;

    if(flagxyz == 1 || flagxyz == 4){
        return *(scheme.ptr + (x + 1 - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)));
    }
    if(flagxyz == 2 || flagxyz == 5){
        return *(scheme.ptr + (x - LAP + scheme.pitch *(y + 1 - LAP + (z - LAP)*(ny_d + 1))));
    }
    if(flagxyz == 3 || flagxyz == 6){
        return *(scheme.ptr + (x - LAP + scheme.pitch *(y - LAP + (z + 1 - LAP)*ny_d)));
    }
    return 0;
}
// "Plus" flux reconstruction with a per-point choice of scheme.
//   i        : index of the variable inside the SoA fields f / du
//   flagxyzb : flagxyzb.x selects the direction; the whole value is passed on to
//              the boundary-scheme and store helpers
//   scheme   : per-point scheme class (1, 2, anything else)
// get_data_kernel loads an 8-point stencil (offsets -3..4) through the dynamic
// shared buffer `sort`. OCFD_bound_scheme_kernel_p is called first; when it
// returns non-zero, the class picks the reconstruction:
//   1 -> OCFD_OMP6_kernel_P, 2 -> OCFD_weno7_kernel_P, otherwise OCFD_NND2_kernel_P.
// The value of the previous lane is obtained with a shuffle and both are handed to
// put_du_p_kernel by every thread except threadIdx.x == 0.
//
// Fix: the class was only loaded when i == 0, yet i is a launch argument, so for
// every other variable the selection read an uninitialised local. The class is now
// loaded for every i.
//
// NOTE(review): the shuffle is executed inside the `flag != 0` branch; this relies
// on the lanes that exchange data taking the same path — confirm.
__global__ void OCFD_HybridAuto_P_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    extern __shared__ REAL sort[];
    dim3 coords;
    REAL stencil[8];
    int ia1 = -3; int ib1 = 4;
    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);
    if(flag != 0){
        REAL tmp_r, tmp_l;
        // The scheme class depends on the point only, not on the variable index.
        const int Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);
        flag = OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);
        if(flag != 0){
            if(Hyscheme_flag == 1){
                tmp_r = OCFD_OMP6_kernel_P(0, &stencil[0]);
            }else if(Hyscheme_flag == 2){
                tmp_r = OCFD_weno7_kernel_P(&stencil[0]);
            }else{
                tmp_r = OCFD_NND2_kernel_P(&stencil[2]);
            }
        }
        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);
        if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
    }
}
/*
 * Hybrid "P" reconstruction (Jameson variant) for flux component i.
 *
 * Same structure as the non-Jameson kernel: an 8-point stencil (offsets
 * -3..4) is loaded via get_data_kernel using the dynamic shared buffer
 * `sort`; where no boundary scheme applies the per-point selector picks
 *   1 -> OCFD_UP7_kernel_P, 2 -> OCFD_weno7_kernel_P, otherwise OCFD_weno5_kernel_P.
 * The result is differenced against the previous lane and written by
 * put_du_p_kernel for every thread with threadIdx.x != 0.
 */
__global__ void OCFD_HybridAuto_P_Jameson_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    extern __shared__ REAL sort[];
    dim3 coords;
    REAL stencil[8];

    int ia1 = -3; int ib1 = 4;

    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);

    if(flag != 0){
        REAL tmp_r, tmp_l;

        // The selector was previously fetched only when i == 0, leaving it
        // uninitialized (undefined behaviour) for every other component.
        // It does not depend on i, so read it on every launch.
        const int Hyscheme_flag = get_Hyscheme_flag_p_kernel(flagxyzb.x, coords, scheme, job);

        flag = OCFD_bound_scheme_kernel_p(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);

        if(flag != 0){
            if(Hyscheme_flag == 1){
                tmp_r = OCFD_UP7_kernel_P(&stencil[0]);
            }else if(Hyscheme_flag == 2){
                tmp_r = OCFD_weno7_kernel_P(&stencil[0]);
            }else{
                tmp_r = OCFD_weno5_kernel_P(&stencil[1]);
            }
        }

        // Fetch the value computed by the previous lane.
        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);

        if(threadIdx.x != 0) put_du_p_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
    }
}
/*
 * Read the hybrid-scheme selector used by the "M" kernels for one grid point.
 *
 * Unlike the "P" lookup, no index is shifted by one; the only difference
 * between directions is the row count (ny_d + 1 for flagxyz 2 or 5, ny_d
 * otherwise). Any flagxyz outside 1..6 yields 0.
 * coords is relative to job.start; only job.start is read from job.
 */
__device__ int get_Hyscheme_flag_m_kernel(int flagxyz, dim3 coords, cudaField_int scheme, cudaJobPackage job){
    const unsigned int x = coords.x + job.start.x;
    const unsigned int y = coords.y + job.start.y;
    const unsigned int z = coords.z + job.start.z;

    if(flagxyz == 1 || flagxyz == 4){
        return scheme.ptr[x - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)];
    }
    if(flagxyz == 2 || flagxyz == 5){
        return scheme.ptr[x - LAP + scheme.pitch *(y - LAP + (z - LAP)*(ny_d + 1))];
    }
    if(flagxyz == 3 || flagxyz == 6){
        return scheme.ptr[x - LAP + scheme.pitch *(y - LAP + (z - LAP)*ny_d)];
    }
    return 0;
}
/*
 * Hybrid "M" reconstruction for flux component i.
 *
 * Each thread loads an 8-point stencil (offsets -4..3) through
 * get_data_kernel, using the dynamic shared-memory buffer `sort`. Where
 * OCFD_bound_scheme_kernel_m reports that no boundary scheme was applied
 * (non-zero return), the per-point selector chooses the scheme:
 *   1 -> OCFD_OMP6_kernel_M, 2 -> OCFD_weno7_kernel_M, otherwise OCFD_NND2_kernel_M.
 * The left neighbour's value is obtained with __shfl_up_double and the
 * difference is written by put_du_m_kernel for every thread with threadIdx.x != 0.
 */
__global__ void OCFD_HybridAuto_M_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    extern __shared__ REAL sort[];
    dim3 coords;
    REAL stencil[8];

    int ia1 = -4; int ib1 = 3;

    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);

    if(flag != 0){
        REAL tmp_r, tmp_l;

        // The selector was previously fetched only when i == 0, leaving it
        // uninitialized (undefined behaviour) for every other component.
        // It does not depend on i, so read it on every launch.
        const int Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);

        flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);

        if(flag != 0){
            if(Hyscheme_flag == 1){
                tmp_r = OCFD_OMP6_kernel_M(0, &stencil[0]);
            }else if(Hyscheme_flag == 2){
                tmp_r = OCFD_weno7_kernel_M(&stencil[1]);
            }else{
                tmp_r = OCFD_NND2_kernel_M(&stencil[3]);
            }
        }

        // Fetch the value computed by the previous lane.
        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);

        if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
    }
}
/*
 * Hybrid "M" reconstruction (Jameson variant) for flux component i.
 *
 * Same structure as the non-Jameson kernel: an 8-point stencil (offsets
 * -4..3) is loaded via get_data_kernel using the dynamic shared buffer
 * `sort`; where no boundary scheme applies the per-point selector picks
 *   1 -> OCFD_UP7_kernel_M, 2 -> OCFD_weno7_kernel_M, otherwise OCFD_weno5_kernel_M.
 * The result is differenced against the previous lane and written by
 * put_du_m_kernel for every thread with threadIdx.x != 0.
 */
__global__ void OCFD_HybridAuto_M_Jameson_kernel(int i, dim3 flagxyzb, cudaSoA f, cudaSoA du, cudaField Ajac, cudaField_int scheme, cudaJobPackage job){
    extern __shared__ REAL sort[];
    dim3 coords;
    REAL stencil[8];

    int ia1 = -4; int ib1 = 3;

    int flag = get_data_kernel(flagxyzb.x, &coords, f, i, &stencil[0], ia1, ib1, sort, job);

    if(flag != 0){
        REAL tmp_r, tmp_l;

        // The selector was previously fetched only when i == 0, leaving it
        // uninitialized (undefined behaviour) for every other component.
        // It does not depend on i, so read it on every launch.
        const int Hyscheme_flag = get_Hyscheme_flag_m_kernel(flagxyzb.x, coords, scheme, job);

        flag = OCFD_bound_scheme_kernel_m(&tmp_r, flagxyzb, coords, &stencil[0], ia1, ib1, job);

        if(flag != 0){
            if(Hyscheme_flag == 1){
                tmp_r = OCFD_UP7_kernel_M(&stencil[0]);
            }else if(Hyscheme_flag == 2){
                tmp_r = OCFD_weno7_kernel_M(&stencil[1]);
            }else{
                tmp_r = OCFD_weno5_kernel_M(&stencil[2]);
            }
        }

        // Fetch the value computed by the previous lane.
        tmp_l = __shfl_up_double(tmp_r, 1, warpSize);

        if(threadIdx.x != 0) put_du_m_kernel(flagxyzb, coords, tmp_r, tmp_l, du, i, Ajac, job);
    }
}
#ifdef __cplusplus
}
#endif
#include <stdio.h>
#include <stdlib.h>
#include "OCFD_Stream.h"
#include "OCFD_split.h"
#include "OCFD_NS_Jacobian3d.h"
#include "parameters.h"
#include "OCFD_mpi_dev.h"
#include "parameters_d.h"
#include "commen_kernel.h"
#include "OCFD_Schemes_hybrid_auto.h"
#ifdef __cplusplus
extern "C" {
#endif
//static cudaStream_t Stream[15];
/* Create the four streams (Stream[0..3]) and then the four events (Event[0..3]). */
void opencfd_mem_init_Stream(){
    int idx;

    idx = 0;
    while(idx < 4){
        cudaStreamCreate(&Stream[idx]);
        ++idx;
    }

    idx = 0;
    while(idx < 4){
        cudaEventCreate(&Event[idx]);
        ++idx;
    }
}
/* Destroy the four streams (Stream[0..3]) and then the four events (Event[0..3]). */
void opencfd_mem_finalize_Stream(){
    int idx;

    idx = 0;
    while(idx < 4){
        cudaStreamDestroy(Stream[idx]);
        ++idx;
    }

    idx = 0;
    while(idx < 4){
        cudaEventDestroy(Event[idx]);
        ++idx;
    }
}
/*
 * Evaluate the right-hand side into pdu_d.
 *
 * KRK : stage index supplied by the caller; when it is 1 and
 *       IFLAG_HybridAuto == 1 the hybrid scheme map is refreshed first.
 * pdu_d is zero-filled (nx x ny x nz*5) and then either the sequential path
 * (Stream_MODE 0) or the stream-overlapped path (Stream_MODE 1) is run.
 * Any other mode only prints a message on rank 0.
 */
void du_comput(int KRK){
    if(IFLAG_HybridAuto == 1){
        if(KRK == 1) Set_Scheme_HybridAuto(&Stream[0]);
    }

    cuda_mem_value_init_warp(0.0 ,pdu_d->ptr, pdu_d->pitch, nx, ny, nz*5);

    const int mode = Stream_MODE;

    if(mode == 0){
        // Non-stream: inviscid pass followed by viscous pass.
        du_invis_Jacobian3d(NULL);
        du_vis_Jacobian3d(NULL);
    }else if(mode == 1){
        // Stream: inviscid and viscous work issued together.
        du_Jacobian3d_all(NULL);
    }else{
        if(my_id == 0) printf("\033[31mWrong Stream Mode! Please choose 0 or 1, 0:non stream; 1:stream\033[0m\n");
    }
}
/*
void *du_Jacobian3d_all(void* pthread_id){
cudaJobPackage job(dim3(2*LAP, 2*LAP, 2*LAP), dim3(nx, ny, nz));
//cudaJobPackage job(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap));
du_invis_Jacobian3d_init(job, &Stream[0]);//内区声速计算
job.setup(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
Stager_Warming(job, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, &Stream[0]);
du_invis_Jacobian3d_x(job, pfp_x_d, pfm_x_d, &Stream[0]);//内区无粘项计算
du_invis_Jacobian3d_y(job, pfp_y_d, pfm_y_d, &Stream[0]);
du_invis_Jacobian3d_z(job, pfp_z_d, pfm_z_d, &Stream[0]);
du_invis_Jacobian3d_outer_exchange(&Stream[2]);//交换原始变量
cudaDeviceSynchronize();
du_invis_Jacobian3d_outer_x(&Stream[1]);//外区计算
du_invis_Jacobian3d_outer_y(&Stream[1]);
du_invis_Jacobian3d_outer_z(&Stream[1]);
cudaDeviceSynchronize();
du_viscous_Jacobian3d_init(&Stream[2]);//开始算粘性项全体导数
cudaDeviceSynchronize();
du_viscous_Jacobian3d_x_init(&Stream[2]);//粘性项计算
du_vis_Jacobian3d_inner_x(&Stream[2]);//内区开始计算
cudaDeviceSynchronize();
du_vis_Jacobian3d_outer_x(&Stream[3]);//外区x计算
cudaDeviceSynchronize();
du_viscous_Jacobian3d_y_init(&Stream[2]);
du_vis_Jacobian3d_inner_y(&Stream[2]);//内区开始计算
cudaDeviceSynchronize();
du_vis_Jacobian3d_outer_y(&Stream[3]);//外区x计算
cudaDeviceSynchronize();
du_viscous_Jacobian3d_z_init(&Stream[2]);
du_vis_Jacobian3d_inner_z(&Stream[2]);//内区开始计算
cudaDeviceSynchronize();
du_vis_Jacobian3d_outer_z(&Stream[3]);//外区x计算
return NULL;
}
*/
/*
 * Stream-overlapped evaluation of the inviscid and viscous terms.
 *
 * Work is issued on four streams and ordered with events:
 *   Stream[0] - inviscid init, Stager_Warming and the x/y/z inviscid terms
 *               for the inner job (start 3*LAP, end n-LAP in each direction);
 *   Stream[1] - asynchronous boundary exchange of pd/pu/pv/pw/pT, then the
 *               outer-region inviscid terms (x, y, z slabs);
 *   Stream[2] - viscous init and the inner-region viscous terms; it first
 *               waits on Event[1] (exchange recorded on Stream[1]);
 *   Stream[3] - outer-region viscous terms; waits on Event[2] recorded on
 *               Stream[2] after each direction's *_init call.
 * Event[3] (recorded on Stream[3]) makes Stream[2] wait before the y and z
 * viscous init calls.
 *
 * pthread_id is unused. Always returns NULL. No host-side synchronization is
 * performed here.
 */
void *du_Jacobian3d_all(void* pthread_id){
cudaJobPackage job(dim3(2*LAP, 2*LAP, 2*LAP), dim3(nx, ny, nz));
du_invis_Jacobian3d_init(job, &Stream[0]);// inner region: sound-speed computation
job.setup(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
Stager_Warming(job, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, &Stream[0]);
du_invis_Jacobian3d_x(job, pfp_x_d, pfm_x_d, &Stream[0]);// inner region: inviscid terms
du_invis_Jacobian3d_y(job, pfp_y_d, pfm_y_d, &Stream[0]);
du_invis_Jacobian3d_z(job, pfp_z_d, pfm_z_d, &Stream[0]);
du_invis_Jacobian3d_outer_exchange(&Stream[1]);// exchange the primitive variables
cudaEventRecord(Event[1], Stream[1]);// mark completion of the data exchange
cudaStreamWaitEvent(Stream[2], Event[1], 0);// viscous stream waits for the exchange
du_invis_Jacobian3d_outer_x(&Stream[1]);// outer region: inviscid, x slabs
du_viscous_Jacobian3d_init(&Stream[2]);// start computing all derivatives for the viscous terms
du_viscous_Jacobian3d_x_init(&Stream[2]);// viscous terms, x direction
cudaEventRecord(Event[2], Stream[2]);// mark the viscous x init
du_vis_Jacobian3d_inner_x(&Stream[2]);// inner region starts computing
cudaStreamWaitEvent(Stream[3], Event[2], 0);// outer region waits for the viscous init
du_vis_Jacobian3d_outer_x(&Stream[3]);// outer region: viscous, x direction
cudaEventRecord(Event[3], Stream[3]);
du_invis_Jacobian3d_outer_y(&Stream[1]);
cudaStreamWaitEvent(Stream[2], Event[3], 0);
du_viscous_Jacobian3d_y_init(&Stream[2]);
cudaEventRecord(Event[2], Stream[2]);// mark the viscous y init
du_vis_Jacobian3d_inner_y(&Stream[2]);// inner region starts computing
cudaStreamWaitEvent(Stream[3], Event[2], 0);// outer region waits for the viscous init
du_vis_Jacobian3d_outer_y(&Stream[3]);// outer region: viscous, y direction
cudaEventRecord(Event[3], Stream[3]);
du_invis_Jacobian3d_outer_z(&Stream[1]);
cudaStreamWaitEvent(Stream[2], Event[3], 0);
du_viscous_Jacobian3d_z_init(&Stream[2]);
cudaEventRecord(Event[2], Stream[2]);// mark the viscous z init
du_vis_Jacobian3d_inner_z(&Stream[2]);// inner region starts computing
cudaStreamWaitEvent(Stream[3], Event[2], 0);
du_vis_Jacobian3d_outer_z(&Stream[3]);// outer region: viscous, z direction
return NULL;
}
/*void* du_invis_Jacobian3d_all(void* pthread_id){
cudaJobPackage job(dim3(2*LAP, 2*LAP, 2*LAP), dim3(nx, ny, nz));
du_invis_Jacobian3d_init(job, &Stream[0]);
job.setup(dim3(3*LAP, 3*LAP, 3*LAP), dim3(nx-LAP, ny-LAP, nz-LAP));
//direction X ------------------------------
du_invis_Jacobian3d_x(job, pfp_d, pfm_d, &Stream[0]);
du_invis_Jacobian3d_outer_x(&Stream[1]);
//direction Y ------------------------------
cudaEventRecord(Event[0], Stream[0]);
cudaEventRecord(Event[1], Stream[1]);
cudaStreamWaitEvent(Stream[0], Event[1], 0);
du_invis_Jacobian3d_y(job, pfp_d, pfm_d, &Stream[0]);
du_invis_Jacobian3d_outer_y(&Stream[1], &Event[0]);
//direction Z ------------------------------
cudaEventRecord(Event[0], Stream[0]);
cudaEventRecord(Event[1], Stream[1]);
cudaStreamWaitEvent(Stream[0], Event[1], 0);
du_invis_Jacobian3d_z(job, pfp_d, pfm_d, &Stream[0]);
du_invis_Jacobian3d_outer_z(&Stream[1], &Event[0]);
cudaEventRecord(Event[1], Stream[1]);
return NULL;
}*/
//void* du_vis_Jacobian3d_all(void* pthread_id){
//
// cudaStreamWaitEvent(Stream[2], Event[1], 0);
// du_viscous_Jacobian3d_init(&Stream[2]);
//
// //direction X ------------------------------
//
// du_viscous_Jacobian3d_x_init(&Stream[2]);
// cudaEventRecord(Event[2], Stream[2]);
// du_vis_Jacobian3d_inner_x(&Stream[2]);
// cudaStreamWaitEvent(Stream[1], Event[2], 0);
// du_vis_Jacobian3d_outer_x(&Stream[1]);
//
// //direction Y ------------------------------
//
// cudaEventRecord(Event[2], Stream[1]);
// cudaStreamWaitEvent(Stream[2], Event[2], 0);
// du_viscous_Jacobian3d_y_init(&Stream[2]);
// cudaEventRecord(Event[2], Stream[2]);
// du_vis_Jacobian3d_inner_y(&Stream[2]);
// cudaStreamWaitEvent(Stream[1], Event[2], 0);
// du_vis_Jacobian3d_outer_y(&Stream[1]);
//
// //direction X ------------------------------
//
// cudaEventRecord(Event[2], Stream[1]);
// cudaStreamWaitEvent(Stream[2], Event[2], 0);
// du_viscous_Jacobian3d_z_init(&Stream[2]);
// cudaEventRecord(Event[2], Stream[2]);
// du_vis_Jacobian3d_inner_z(&Stream[2]);
// cudaStreamWaitEvent(Stream[1], Event[2], 0);
// du_vis_Jacobian3d_outer_z(&Stream[1]);
//
//
// return NULL;
//}
/*
 * Viscous-only stream schedule.
 *
 * Inner-region work runs on Stream[2]; outer-region work (including the
 * boundary exchange done inside du_vis_Jacobian3d_outer_*) runs on Stream[3].
 * Event[2] is reused as the hand-off in both directions: recorded on
 * Stream[2] after each *_init call so Stream[3] can start, and recorded on
 * Stream[3] after each outer call so Stream[2] can begin the next direction.
 * Stream[3] additionally waits on Event[1] before the x direction.
 * NOTE(review): Event[1] is not recorded in this function; presumably the
 * caller records it after the primitive-variable exchange - confirm.
 *
 * pthread_id is unused. Always returns NULL.
 */
void* du_vis_Jacobian3d_all(void* pthread_id){
du_viscous_Jacobian3d_init(&Stream[2]);
//direction X ------------------------------
du_viscous_Jacobian3d_x_init(&Stream[2]);
cudaEventRecord(Event[2], Stream[2]);
du_vis_Jacobian3d_inner_x(&Stream[2]);
cudaStreamWaitEvent(Stream[3], Event[1], 0);
cudaStreamWaitEvent(Stream[3], Event[2], 0);
du_vis_Jacobian3d_outer_x(&Stream[3]);
//direction Y ------------------------------
// Stream[2] must not start the y init before the outer x work on Stream[3] is done.
cudaEventRecord(Event[2], Stream[3]);
cudaStreamWaitEvent(Stream[2], Event[2], 0);
du_viscous_Jacobian3d_y_init(&Stream[2]);
cudaEventRecord(Event[2], Stream[2]);
du_vis_Jacobian3d_inner_y(&Stream[2]);
cudaStreamWaitEvent(Stream[3], Event[2], 0);
du_vis_Jacobian3d_outer_y(&Stream[3]);
//direction Z ------------------------------
// Same hand-off as above, now between the outer y work and the z init.
cudaEventRecord(Event[2], Stream[3]);
cudaStreamWaitEvent(Stream[2], Event[2], 0);
du_viscous_Jacobian3d_z_init(&Stream[2]);
cudaEventRecord(Event[2], Stream[2]);
du_vis_Jacobian3d_inner_z(&Stream[2]);
cudaStreamWaitEvent(Stream[3], Event[2], 0);
du_vis_Jacobian3d_outer_z(&Stream[3]);
return NULL;
}
/* Run du_viscous_Jacobian3d_x_final on the inner job (start 3*LAP, end n-LAP per axis). Returns NULL. */
void* du_vis_Jacobian3d_inner_x(cudaStream_t *stream){
    const dim3 innerStart(3*LAP, 3*LAP, 3*LAP);
    const dim3 innerEnd(nx-LAP, ny-LAP, nz-LAP);
    cudaJobPackage innerJob(innerStart, innerEnd);

    du_viscous_Jacobian3d_x_final(innerJob, stream);
    return NULL;
}
/* Run du_viscous_Jacobian3d_y_final on the inner job (start 3*LAP, end n-LAP per axis). Returns NULL. */
void* du_vis_Jacobian3d_inner_y(cudaStream_t *stream){
    const dim3 innerStart(3*LAP, 3*LAP, 3*LAP);
    const dim3 innerEnd(nx-LAP, ny-LAP, nz-LAP);
    cudaJobPackage innerJob(innerStart, innerEnd);

    du_viscous_Jacobian3d_y_final(innerJob, stream);
    return NULL;
}
/* Run du_viscous_Jacobian3d_z_final on the inner job (start 3*LAP, end n-LAP per axis). Returns NULL. */
void* du_vis_Jacobian3d_inner_z(cudaStream_t *stream){
    const dim3 innerStart(3*LAP, 3*LAP, 3*LAP);
    const dim3 innerEnd(nx-LAP, ny-LAP, nz-LAP);
    cudaJobPackage innerJob(innerStart, innerEnd);

    du_viscous_Jacobian3d_z_final(innerJob, stream);
    return NULL;
}
/*
 * Non-stream inviscid pass: exchange the boundaries of pd, pu, pv, pw and pT,
 * then run the inviscid init, Stager_Warming and the x/y/z inviscid terms on
 * Stream[0] for the job with start (LAP, LAP, LAP) and end
 * (nx_lap, ny_lap, nz_lap). pthread_id is unused; returns NULL.
 */
void* du_invis_Jacobian3d(void* pthread_id){
    // Boundary exchange of the five primitive fields.
    exchange_boundary_xyz_packed_dev(pd , pd_d);
    exchange_boundary_xyz_packed_dev(pu , pu_d);
    exchange_boundary_xyz_packed_dev(pv , pv_d);
    exchange_boundary_xyz_packed_dev(pw , pw_d);
    exchange_boundary_xyz_packed_dev(pT , pT_d);

    cudaStream_t *const s0 = &Stream[0];
    cudaJobPackage whole(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap));

    du_invis_Jacobian3d_init(whole, s0);
    Stager_Warming(whole, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, s0);

    du_invis_Jacobian3d_x(whole, pfp_x_d, pfm_x_d, s0);
    du_invis_Jacobian3d_y(whole, pfp_y_d, pfm_y_d, s0);
    du_invis_Jacobian3d_z(whole, pfp_z_d, pfm_z_d, s0);

    return NULL;
}
/*
 * Non-stream viscous pass on Stream[0]. For each direction: run the
 * direction's *_init, exchange pEv1..pEv4 in that direction, then run the
 * direction's *_final on the job with start (LAP, LAP, LAP) and end
 * (nx_lap, ny_lap, nz_lap). The y direction additionally calls
 * boundary_symmetry_pole_vis_y before its final stage.
 * pthread_id is unused; returns NULL.
 */
void* du_vis_Jacobian3d(void* pthread_id){
    cudaStream_t *const s0 = &Stream[0];
    cudaJobPackage whole(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, nz_lap));

    du_viscous_Jacobian3d_init(s0);

    // ---- x direction ----
    du_viscous_Jacobian3d_x_init(s0);
    exchange_boundary_x_packed_dev(pEv1 , pEv1_d , Iperiodic[0]);
    exchange_boundary_x_packed_dev(pEv2 , pEv2_d , Iperiodic[0]);
    exchange_boundary_x_packed_dev(pEv3 , pEv3_d , Iperiodic[0]);
    exchange_boundary_x_packed_dev(pEv4 , pEv4_d , Iperiodic[0]);
    du_viscous_Jacobian3d_x_final(whole, s0);

    // ---- y direction ----
    du_viscous_Jacobian3d_y_init(s0);
    exchange_boundary_y_packed_dev(pEv1 , pEv1_d , Iperiodic[1]);
    exchange_boundary_y_packed_dev(pEv2 , pEv2_d , Iperiodic[1]);
    exchange_boundary_y_packed_dev(pEv3 , pEv3_d , Iperiodic[1]);
    exchange_boundary_y_packed_dev(pEv4 , pEv4_d , Iperiodic[1]);
    boundary_symmetry_pole_vis_y(s0);
    du_viscous_Jacobian3d_y_final(whole, s0);

    // ---- z direction ----
    du_viscous_Jacobian3d_z_init(s0);
    exchange_boundary_z_packed_dev(pEv1 , pEv1_d ,Iperiodic[2]);
    exchange_boundary_z_packed_dev(pEv2 , pEv2_d ,Iperiodic[2]);
    exchange_boundary_z_packed_dev(pEv3 , pEv3_d ,Iperiodic[2]);
    exchange_boundary_z_packed_dev(pEv4 , pEv4_d ,Iperiodic[2]);
    du_viscous_Jacobian3d_z_final(whole, s0);

    return NULL;
}
/*
 * Inviscid init + Stager_Warming on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_init_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_invis_Jacobian3d_init(slab, stream);
        Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
    }
    return NULL;
}
/*
 * x-direction inviscid term on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_x_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
    }
    return NULL;
}
/*
 * y-direction inviscid term on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_y_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
    }
    return NULL;
}
/*
 * z-direction inviscid term on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_z_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
    }
    return NULL;
}
/*
 * Inviscid init + Stager_Warming on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_init_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_invis_Jacobian3d_init(slab, stream);
        Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
    }
    return NULL;
}
/*
 * x-direction inviscid term on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_x_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
    }
    return NULL;
}
/*
 * y-direction inviscid term on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_y_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
    }
    return NULL;
}
/*
 * z-direction inviscid term on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_z_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
    }
    return NULL;
}
/*
 * Inviscid init + Stager_Warming on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_init_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_invis_Jacobian3d_init(slab, stream);
        Stager_Warming(slab, pfp_x_d, pfm_x_d, pfp_y_d, pfm_y_d, pfp_z_d, pfm_z_d, stream);
    }
    return NULL;
}
/*
 * x-direction inviscid term on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_x_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_invis_Jacobian3d_x(slab, pfp_x_d, pfm_x_d, stream);
    }
    return NULL;
}
/*
 * y-direction inviscid term on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_y_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_invis_Jacobian3d_y(slab, pfp_y_d, pfm_y_d, stream);
    }
    return NULL;
}
/*
 * z-direction inviscid term on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_z_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_invis_Jacobian3d_z(slab, pfp_z_d, pfm_z_d, stream);
    }
    return NULL;
}
/*
 * Issue the asynchronous x/y/z boundary exchange for the five primitive
 * fields (pd, pu, pv, pw, pT, in that order) on the given stream. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_exchange(cudaStream_t *stream){
    cudaStream_t *const xchg = stream;

    exchange_boundary_xyz_Async_packed_dev(pd , pd_d , xchg);
    exchange_boundary_xyz_Async_packed_dev(pu , pu_d , xchg);
    exchange_boundary_xyz_Async_packed_dev(pv , pv_d , xchg);
    exchange_boundary_xyz_Async_packed_dev(pw , pw_d , xchg);
    exchange_boundary_xyz_Async_packed_dev(pT , pT_d , xchg);

    return NULL;
}
/*
 * All inviscid work on the x-direction outer slabs, in order:
 * init, then the x, y and z inviscid terms. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_x(cudaStream_t *stream){
    // 1) init
    du_invis_Jacobian3d_outer_init_x(stream);

    // 2) the three directional terms on the same slabs
    du_invis_Jacobian3d_outer_x_x(stream);
    du_invis_Jacobian3d_outer_y_x(stream);
    du_invis_Jacobian3d_outer_z_x(stream);

    return NULL;
}
/*
 * All inviscid work on the y-direction outer slabs, in order:
 * init, then the x, y and z inviscid terms. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_y(cudaStream_t *stream){
    // 1) init
    du_invis_Jacobian3d_outer_init_y(stream);

    // 2) the three directional terms on the same slabs
    du_invis_Jacobian3d_outer_x_y(stream);
    du_invis_Jacobian3d_outer_y_y(stream);
    du_invis_Jacobian3d_outer_z_y(stream);

    return NULL;
}
/*
 * All inviscid work on the z-direction outer slabs, in order:
 * init, then the x, y and z inviscid terms. Returns NULL.
 */
void* du_invis_Jacobian3d_outer_z(cudaStream_t *stream){
    // 1) init
    du_invis_Jacobian3d_outer_init_z(stream);

    // 2) the three directional terms on the same slabs
    du_invis_Jacobian3d_outer_x_z(stream);
    du_invis_Jacobian3d_outer_y_z(stream);
    du_invis_Jacobian3d_outer_z_z(stream);

    return NULL;
}
/*
 * x-direction viscous final stage on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_x_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_viscous_Jacobian3d_x_final(slab, stream);
    }
    return NULL;
}
/*
 * y-direction viscous final stage on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_y_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_viscous_Jacobian3d_y_final(slab, stream);
    }
    return NULL;
}
/*
 * z-direction viscous final stage on the two x-direction outer slabs.
 * First slab: start (LAP, 3*LAP, 3*LAP), end (3*LAP, ny-LAP, nz-LAP);
 * second slab: same y/z range with x from nx-LAP to nx_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_z_x(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, 3*LAP, 3*LAP), dim3(3*LAP, ny-LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-x side.
            slab.start.x = nx-LAP;
            slab.end.x = nx_lap;
        }
        du_viscous_Jacobian3d_z_final(slab, stream);
    }
    return NULL;
}
/*
 * x-direction viscous final stage on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_x_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_viscous_Jacobian3d_x_final(slab, stream);
    }
    return NULL;
}
/*
 * y-direction viscous final stage on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_y_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_viscous_Jacobian3d_y_final(slab, stream);
    }
    return NULL;
}
/*
 * z-direction viscous final stage on the two y-direction outer slabs.
 * First slab: start (LAP, LAP, 3*LAP), end (nx_lap, 3*LAP, nz-LAP);
 * second slab: same x/z range with y from ny-LAP to ny_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_z_y(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, 3*LAP), dim3(nx_lap, 3*LAP, nz-LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-y side.
            slab.start.y = ny-LAP;
            slab.end.y = ny_lap;
        }
        du_viscous_Jacobian3d_z_final(slab, stream);
    }
    return NULL;
}
/*
 * x-direction viscous final stage on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_x_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_viscous_Jacobian3d_x_final(slab, stream);
    }
    return NULL;
}
/*
 * y-direction viscous final stage on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_y_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_viscous_Jacobian3d_y_final(slab, stream);
    }
    return NULL;
}
/*
 * z-direction viscous final stage on the two z-direction outer slabs.
 * First slab: start (LAP, LAP, LAP), end (nx_lap, ny_lap, 3*LAP);
 * second slab: same x/y range with z from nz-LAP to nz_lap. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_z_z(cudaStream_t *stream){
    cudaJobPackage slab(dim3(LAP, LAP, LAP), dim3(nx_lap, ny_lap, 3*LAP));

    for(int pass = 0; pass < 2; ++pass){
        if(pass == 1){
            // Second pass: move the slab to the high-z side.
            slab.start.z = nz-LAP;
            slab.end.z = nz_lap;
        }
        du_viscous_Jacobian3d_z_final(slab, stream);
    }
    return NULL;
}
/*
 * Outer-region viscous work for the x direction: asynchronously exchange
 * pEv1..pEv4 in x (periodicity flag Iperiodic[0]) on `stream`, then run the
 * x-direction final stage on the x, y and z outer slabs. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_x(cudaStream_t *stream){
    // Boundary exchange of the four viscous fields.
    exchange_boundary_x_Async_packed_dev(pEv1 , pEv1_d , Iperiodic[0], stream);
    exchange_boundary_x_Async_packed_dev(pEv2 , pEv2_d , Iperiodic[0], stream);
    exchange_boundary_x_Async_packed_dev(pEv3 , pEv3_d , Iperiodic[0], stream);
    exchange_boundary_x_Async_packed_dev(pEv4 , pEv4_d , Iperiodic[0], stream);

    // Final stage on each family of outer slabs.
    du_vis_Jacobian3d_outer_x_x(stream);
    du_vis_Jacobian3d_outer_x_y(stream);
    du_vis_Jacobian3d_outer_x_z(stream);

    return NULL;
}
/*
 * Outer-region viscous work for the y direction: asynchronously exchange
 * pEv1..pEv4 in y (periodicity flag Iperiodic[1]) on `stream`, call
 * boundary_symmetry_pole_vis_y, then run the y-direction final stage on the
 * x, y and z outer slabs. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_y(cudaStream_t *stream){
    // Boundary exchange of the four viscous fields.
    exchange_boundary_y_Async_packed_dev(pEv1 , pEv1_d , Iperiodic[1], stream);
    exchange_boundary_y_Async_packed_dev(pEv2 , pEv2_d , Iperiodic[1], stream);
    exchange_boundary_y_Async_packed_dev(pEv3 , pEv3_d , Iperiodic[1], stream);
    exchange_boundary_y_Async_packed_dev(pEv4 , pEv4_d , Iperiodic[1], stream);

    boundary_symmetry_pole_vis_y(stream);

    // Final stage on each family of outer slabs.
    du_vis_Jacobian3d_outer_y_x(stream);
    du_vis_Jacobian3d_outer_y_y(stream);
    du_vis_Jacobian3d_outer_y_z(stream);

    return NULL;
}
/*
 * Outer-region viscous work for the z direction: asynchronously exchange
 * pEv1..pEv4 in z (periodicity flag Iperiodic[2]) on `stream`, then run the
 * z-direction final stage on the x, y and z outer slabs. Returns NULL.
 */
void* du_vis_Jacobian3d_outer_z(cudaStream_t *stream){
    // Boundary exchange of the four viscous fields.
    exchange_boundary_z_Async_packed_dev(pEv1 , pEv1_d , Iperiodic[2], stream);
    exchange_boundary_z_Async_packed_dev(pEv2 , pEv2_d , Iperiodic[2], stream);
    exchange_boundary_z_Async_packed_dev(pEv3 , pEv3_d , Iperiodic[2], stream);
    exchange_boundary_z_Async_packed_dev(pEv4 , pEv4_d , Iperiodic[2], stream);

    // Final stage on each family of outer slabs.
    du_vis_Jacobian3d_outer_z_x(stream);
    du_vis_Jacobian3d_outer_z_y(stream);
    du_vis_Jacobian3d_outer_z_z(stream);

    return NULL;
}
#ifdef __cplusplus
}
#endif
#include "OCFD_ana.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Copy x2 (accessed with get_Field_LAP at (x, y, z)) into x1 (accessed with
 * get_Field at (x-LAP, y-LAP, z-LAP)) for every point of the job.
 * One thread per point; threads at or beyond job.end do nothing.
 */
__global__ void get_inner_kernel(cudaField x1, cudaField x2, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // Guard: threads outside the job range have nothing to copy.
    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    get_Field(x1, x-LAP, y-LAP, z-LAP) = get_Field_LAP(x2, x, y, z);
}
/*
 * Launch get_inner_kernel over the job with start (LAP, LAP, LAP) and end
 * (nx_lap, ny_lap, nz_lap), copying x2 into x1. The launch shape is obtained
 * from cal_grid_block_dim for an nx x ny x nz extent.
 */
void get_inner(cudaField x1, cudaField x2){
    dim3 gridShape , blockShape;
    cal_grid_block_dim(&gridShape , &blockShape , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );

    cudaJobPackage interior(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));

    get_inner_kernel<<<gridShape , blockShape>>>(x1, x2, interior);
}
/*
 * Check the Jacobian field for negative values.
 *
 * Copies pAjac_d back to the host buffer pAjac (nx_2lap x ny_2lap x nz_2lap),
 * prints one message per negative entry, and aborts the whole MPI job with a
 * non-zero status if any was found.
 */
void ana_Jac(){
    int i,j,k,flag = 0;
    unsigned long int offset;

    memcpy_All(pAjac , pAjac_d->ptr , pAjac_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    for(k=0;k<nz_2lap;k++){
        for(j=0;j<ny_2lap;j++){
            for(i=0;i<nx_2lap;i++){
                // Widen before multiplying so large local grids cannot overflow int.
                offset = (unsigned long int)i
                       + (unsigned long int)nx_2lap*((unsigned long int)j + (unsigned long int)k*ny_2lap);

                if( *(pAjac + offset) < 0 ){
                    printf("\033[31mNegative Jac occured in %d , %d , %d\033[0m\n",
                           i_offset[npx]+i,j_offset[npy]+j,k_offset[npz]+k);
                    flag = 1;
                }
            }
        }
    }

    if(flag == 1){
        // Previously exit(0): that reported success and left the other ranks
        // without an abort. Flush the diagnostics, then abort the whole job.
        fflush(stdout);
        MPI_Abort(MPI_COMM_WORLD , 1);
    }
}
/*
 * Per-block sum of pE.
 *
 * Each thread reads one value of pE at its global (x, y, z) index (0 if it is
 * at or beyond job.end; job.start is not used), the values are reduced with
 * warpReduce, lane 0 of each warp stores its partial in dynamic shared memory,
 * and after a block barrier the first warp reduces the first SMEMDIM partials.
 * Thread 0 writes the block result to g_odata at the block's linear index.
 *
 * Dynamic shared memory: at least one REAL per warp of the block.
 */
__global__ void add_E_kernel(cudaField pE, int SMEMDIM, REAL *g_odata, cudaJobPackage job){
    extern __shared__ REAL shared[];

    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;

    // Linear thread index inside the block, and its warp/lane split.
    const unsigned int tid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const unsigned int warpNo = tid / warpSize;
    const unsigned int lane = tid % warpSize;

    REAL partial = 0.;
    if(x < job.end.x && y < job.end.y && z < job.end.z){
        partial = get_Field(pE, x, y, z);
    }

    // Stage 1: reduce inside each warp, publish one partial per warp.
    partial = warpReduce(partial);
    if(lane == 0) shared[warpNo] = partial;
    __syncthreads();

    // Stage 2: the first SMEMDIM threads pick up the per-warp partials.
    if(tid < SMEMDIM){
        partial = shared[tid];
    }else{
        partial = 0;
    }
    if(warpNo == 0) partial = warpReduce(partial);

    if(tid == 0){
        g_odata[blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z] = partial;
    }
}
/*
 * Global mean of the field PE_d.
 *
 * PE_d : device field summed over the job (0,0,0)-(nx,ny,nz).
 * E0   : output; receives the MPI-wide sum divided by
 *        NX_GLOBAL * NY_GLOBAL * NZ_GLOBAL.
 *
 * The per-block partial sums produced by add_E_kernel are folded with
 * add_kernel until a single value remains, copied to the host and combined
 * across ranks with MPI_Allreduce.
 */
void ana_residual(cudaField PE_d, REAL *E0){
    dim3 size, griddim, blockdim;
    cudaJobPackage job(dim3(0, 0, 0), dim3(nx, ny, nz));

    jobsize(&job, &size);
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, size.x, size.y, size.z);

    REAL *g_odata;
    REAL Sum = 0;   // host-side result; a stack variable avoids the former leaked malloc

    unsigned int g_odata_size = griddim.x*griddim.y*griddim.z;
    CUDA_LAUNCH(( cudaMalloc((REAL **)&g_odata, g_odata_size*sizeof(REAL)) ));

    // Number of 64-lane warps per block (warp size is assumed to be 64 here),
    // rounded up so a trailing partial warp still gets a slot.
    int SMEMDIM = (blockdim.x*blockdim.y*blockdim.z + 63)/64;

    // The third launch parameter is a size in BYTES: one REAL per warp.
    CUDA_LAUNCH((add_E_kernel<<<griddim, blockdim, SMEMDIM*sizeof(REAL)>>>(PE_d, SMEMDIM, g_odata, job)));

    dim3 blockdim_sum(512);
    dim3 griddim_sum(g_odata_size);

    do{
        griddim_sum.x = (griddim_sum.x + blockdim_sum.x - 1)/blockdim_sum.x;
        // 512 threads = 8 warps of 64 lanes. add_kernel is not visible here;
        // sized as 8 REALs on the assumption that it stores one REAL per warp
        // like add_E_kernel - TODO confirm. Over-allocating is harmless.
        CUDA_LAUNCH(( add_kernel<<<griddim_sum, blockdim_sum, 8*sizeof(REAL)>>>(g_odata, g_odata_size) ));
    } while(griddim_sum.x > 1);

    CUDA_LAUNCH(( cudaMemcpy(&Sum, g_odata, sizeof(REAL), cudaMemcpyDeviceToHost) ));
    CUDA_LAUNCH(( cudaFree(g_odata) ));

    MPI_Allreduce(&Sum, E0, 1, OCFD_DATA_TYPE, MPI_SUM, MPI_COMM_WORLD);

    // Form the point count in floating point so large grids cannot overflow int.
    *E0 /= (double)NX_GLOBAL * NY_GLOBAL * NZ_GLOBAL;
}
/*
 * Sanity check of the temperature field.
 *
 * Copies pT_d back to the host buffer pT (nx_2lap x ny_2lap x nz_2lap) and
 *   - reports the first NaN found (scan stops there),
 *   - reports every negative value and counts them.
 * The MPI job is aborted if more than n_NT_limit negative values were found
 * or if a NaN was found.
 *
 * Only T is inspected: the equivalent checks for d, u, v and w (and their
 * device-to-host copies) are disabled.
 */
void ana_NAN_and_NT(){
    int i,j,k;
    unsigned long int offset;
    unsigned int n_NT_limit = 10;
    char has_nan = 0;
    unsigned long int n_NT = 0;

    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    if(my_id == 0) printf("It is analyzing NAN......\n");

    // ---- NaN scan: stop at the first hit ----
    for(k=0; k<nz_2lap && !has_nan; k++){
        for(j=0; j<ny_2lap && !has_nan; j++){
            for(i=0;i<nx_2lap;i++){
                // Widen before multiplying so large local grids cannot overflow int.
                offset = (unsigned long int)i
                       + (unsigned long int)nx_2lap*((unsigned long int)j + (unsigned long int)k*ny_2lap);

                if( isnan( *(pT + offset) ) ){
                    has_nan = 1;
                    printf("\033[31mNAN occured in Global ID(%d , %d , %d)\033[0m\n\n",i_offset[npx]+i-LAP,j_offset[npy]+j-LAP,k_offset[npz]+k-LAP);
                    break;
                }
            }
        }
    }

    // ---- Negative-temperature scan: report and count every hit ----
    for(k=0;k<nz_2lap;k++){
        for(j=0;j<ny_2lap;j++){
            for(i=0;i<nx_2lap;i++){
                offset = (unsigned long int)i
                       + (unsigned long int)nx_2lap*((unsigned long int)j + (unsigned long int)k*ny_2lap);

                if( *(pT + offset) < 0 ){
                    n_NT++;
                    printf("\033[31mNegative T occured in Global ID(%d , %d , %d)\033[0m\n\n",i_offset[npx]+i-LAP,j_offset[npy]+j-LAP,k_offset[npz]+k-LAP);
                }
            }
        }
    }

    if(n_NT > n_NT_limit){
        // %lu / %u match the unsigned long / unsigned int arguments.
        printf("\033[31mNegative T points %lu > %u\033[0m on Proc(%d , %d , %d)\033[0m\n",n_NT , n_NT_limit,npx,npy,npz);
        MPI_Abort(MPI_COMM_WORLD , 1);
    }

    if( has_nan ){
        if(my_id == 0) printf("\033[31mNAN occured , program Abort\033[0m\n");
        MPI_Abort(MPI_COMM_WORLD , 1);
    }
}
/*
 * Zero the five running-average fields d1, u1, v1, w1, T1 (accessed through
 * get_Field_LAP) at every point of the job. One thread per point; threads at
 * or beyond job.end do nothing.
 */
__global__ void init_time_average_kernel(cudaField d1, cudaField u1, cudaField v1, cudaField w1, cudaField T1, cudaJobPackage job){
    unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // Guard: threads outside the job range have nothing to reset.
    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    get_Field_LAP(d1, x, y, z) = 0.;
    get_Field_LAP(u1, x, y, z) = 0.;
    get_Field_LAP(v1, x, y, z) = 0.;
    get_Field_LAP(w1, x, y, z) = 0.;
    get_Field_LAP(T1, x, y, z) = 0.;
}
// Fold the current sample (d, u, v, w, T) into the running means
// (d1, u1, v1, w1, T1):
//
//     mean <- (Istep * mean + sample) / (Istep + 1)
//
// Istep is the number of samples already contained in the mean. Each thread
// updates the cell at (global thread index + job.start) and skips cells at or
// beyond job.end.
__global__ void ana_time_average_kernel(cudaField d1, cudaField u1, cudaField v1, cudaField w1, cudaField T1,
                                        cudaField d, cudaField u, cudaField v, cudaField w, cudaField T, int Istep, cudaJobPackage job){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    get_Field_LAP(d1, x, y, z) =
        (Istep * get_Field_LAP(d1, x, y, z) + get_Field_LAP(d, x, y, z))/(Istep + 1.);
    get_Field_LAP(u1, x, y, z) =
        (Istep * get_Field_LAP(u1, x, y, z) + get_Field_LAP(u, x, y, z))/(Istep + 1.);
    get_Field_LAP(v1, x, y, z) =
        (Istep * get_Field_LAP(v1, x, y, z) + get_Field_LAP(v, x, y, z))/(Istep + 1.);
    get_Field_LAP(w1, x, y, z) =
        (Istep * get_Field_LAP(w1, x, y, z) + get_Field_LAP(w, x, y, z))/(Istep + 1.);
    get_Field_LAP(T1, x, y, z) =
        (Istep * get_Field_LAP(T1, x, y, z) + get_Field_LAP(T, x, y, z))/(Istep + 1.);
}
// Advance the running time average by one sample.
//
// Steps, in order:
//   1. When average_IO == 1, allocate the host buffers (pdm..pTm) and device
//      fields (pdm_d..pTm_d) and call read_file() on the host buffers.
//   2. Launch ana_time_average_kernel over [LAP, n?_lap) on every axis.
//   3. Bump Istep_average and tt_average.
//   4. Every Kstep_save steps, copy the means to the host and OCFD_save them.
//   5. When tt == end_time, release the host buffers and device fields.
//
// NOTE(review): step 5 compares floating-point times with ==; confirm that tt
// is set to end_time exactly, otherwise the buffers are never released.
// NOTE(review): the byte count is held in an int and may overflow for very
// large blocks - confirm against malloc_me's size parameter.
void ana_time_average(){
    if(my_id == 0) printf("It is averaging......\n");

    // Extents of one field including the LAP-wide layers on both sides.
    const int ext_x = nx + 2 * LAP;
    const int ext_y = ny + 2 * LAP;
    const int ext_z = nz + 2 * LAP;

    if(average_IO == 1){
        int nbytes = ext_x * ext_y * ext_z * sizeof(REAL);

        pdm = (REAL *)malloc_me(nbytes);
        pum = (REAL *)malloc_me(nbytes);
        pvm = (REAL *)malloc_me(nbytes);
        pwm = (REAL *)malloc_me(nbytes);
        pTm = (REAL *)malloc_me(nbytes);

        new_cudaField(&pdm_d , ext_x , ext_y , ext_z);
        new_cudaField(&pum_d , ext_x , ext_y , ext_z);
        new_cudaField(&pvm_d , ext_x , ext_y , ext_z);
        new_cudaField(&pwm_d , ext_x , ext_y , ext_z);
        new_cudaField(&pTm_d , ext_x , ext_y , ext_z);

        read_file(average_IO, pdm, pum, pvm, pwm, pTm);
    }

    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , nz );

    cudaJobPackage job(dim3(LAP,LAP,LAP) , dim3(nx_lap,ny_lap,nz_lap));
    CUDA_LAUNCH(( ana_time_average_kernel<<<griddim , blockdim>>>(*pdm_d, *pum_d, *pvm_d, *pwm_d, *pTm_d,
                                                                  *pd_d, *pu_d, *pv_d, *pw_d, *pT_d, Istep_average, job) ));

    Istep_average += 1;
    tt_average += dt;

    // Periodic checkpoint of the averaged fields.
    if(Istep%Kstep_save == 0){
        memcpy_All(pdm , pdm_d->ptr , pdm_d->pitch , D2H , ext_x , ext_y , ext_z);
        memcpy_All(pum , pum_d->ptr , pum_d->pitch , D2H , ext_x , ext_y , ext_z);
        memcpy_All(pvm , pvm_d->ptr , pvm_d->pitch , D2H , ext_x , ext_y , ext_z);
        memcpy_All(pwm , pwm_d->ptr , pwm_d->pitch , D2H , ext_x , ext_y , ext_z);
        memcpy_All(pTm , pTm_d->ptr , pTm_d->pitch , D2H , ext_x , ext_y , ext_z);
        OCFD_save(1, Istep_average, pdm, pum, pvm, pwm, pTm);
    }

    // Final step: release everything owned by the averaging module.
    if(tt == end_time){
        free(pdm);
        free(pum);
        free(pvm);
        free(pwm);
        free(pTm);

        delete_cudaField(pdm_d);
        delete_cudaField(pum_d);
        delete_cudaField(pvm_d);
        delete_cudaField(pwm_d);
        delete_cudaField(pTm_d);
    }
}
// Zero the device-side time-average accumulators (pdm_d..pTm_d).
//
// The job spans the whole allocation, [0, n?_2lap) on every axis, so the
// launch grid must be sized for those same extents. Sizing it for nx/ny/nz
// (as was done before) leaves the cells in [n, n+LAP) untouched, and those
// cells are later read by ana_time_average_kernel, which works on
// [LAP, n+LAP).
void init_time_average(){
    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx_2lap , ny_2lap , nz_2lap );

    cudaJobPackage job(dim3(0,0,0) , dim3(nx_2lap,ny_2lap,nz_2lap));
    CUDA_LAUNCH(( init_time_average_kernel<<<griddim , blockdim>>>(*pdm_d, *pum_d, *pvm_d, *pwm_d, *pTm_d, job) ));
}
// Compute the field Q from nine velocity derivatives and the grid metrics.
//
// Inputs
//   ui/us/uk, vi/vs/vk, wi/ws/wk : derivative fields, read through get_Field
//                                  at (x, y, z).
//   Akx..Asz, Ajac               : metric fields, read through get_Field_LAP
//                                  at (x+LAP, y+LAP, z+LAP).
// Output
//   Q : written through get_Field_LAP at (x+LAP, y+LAP, z+LAP).
//
// Each of ux..wz combines the k/i/s derivatives with the matching metric
// column (presumably the chain rule to Cartesian gradients - confirm against
// the metric definitions). Q is
//   (ux*vy + ux*wz + vy*wz - uy*vx - uz*wx - vz*wy) * Ajac * Ajac.
//
// Threads are indexed from 0 (job.start is not used); cells at or beyond
// job.end are skipped.
__global__ void get_Q_kernal(
    cudaField ui,
    cudaField us,
    cudaField uk,
    cudaField vi,
    cudaField vs,
    cudaField vk,
    cudaField wi,
    cudaField ws,
    cudaField wk,
    cudaField Akx,
    cudaField Aky,
    cudaField Akz,
    cudaField Aix,
    cudaField Aiy,
    cudaField Aiz,
    cudaField Asx,
    cudaField Asy,
    cudaField Asz,
    cudaField Ajac,
    cudaField Q,
    cudaJobPackage job){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;

    if(x >= job.end.x || y >= job.end.y || z >= job.end.z) return;

    // Index of the same cell in the LAP-offset fields.
    const unsigned int xl = x + LAP;
    const unsigned int yl = y + LAP;
    const unsigned int zl = z + LAP;

    // Metric terms, loaded once instead of once per use.
    const REAL kx = get_Field_LAP(Akx, xl, yl, zl);
    const REAL ky = get_Field_LAP(Aky, xl, yl, zl);
    const REAL kz = get_Field_LAP(Akz, xl, yl, zl);
    const REAL ix = get_Field_LAP(Aix, xl, yl, zl);
    const REAL iy = get_Field_LAP(Aiy, xl, yl, zl);
    const REAL iz = get_Field_LAP(Aiz, xl, yl, zl);
    const REAL sx = get_Field_LAP(Asx, xl, yl, zl);
    const REAL sy = get_Field_LAP(Asy, xl, yl, zl);
    const REAL sz = get_Field_LAP(Asz, xl, yl, zl);

    // Derivative inputs for the three velocity components.
    const REAL u_k = get_Field(uk, x, y, z);
    const REAL u_i = get_Field(ui, x, y, z);
    const REAL u_s = get_Field(us, x, y, z);
    const REAL v_k = get_Field(vk, x, y, z);
    const REAL v_i = get_Field(vi, x, y, z);
    const REAL v_s = get_Field(vs, x, y, z);
    const REAL w_k = get_Field(wk, x, y, z);
    const REAL w_i = get_Field(wi, x, y, z);
    const REAL w_s = get_Field(ws, x, y, z);

    const REAL ux = u_k*kx + u_i*ix + u_s*sx;
    const REAL vx = v_k*kx + v_i*ix + v_s*sx;
    const REAL wx = w_k*kx + w_i*ix + w_s*sx;

    const REAL uy = u_k*ky + u_i*iy + u_s*sy;
    const REAL vy = v_k*ky + v_i*iy + v_s*sy;
    const REAL wy = w_k*ky + w_i*iy + w_s*sy;

    const REAL uz = u_k*kz + u_i*iz + u_s*sz;
    const REAL vz = v_k*kz + v_i*iz + v_s*sz;
    const REAL wz = w_k*kz + w_i*iz + w_s*sz;

    get_Field_LAP(Q, xl, yl, zl) = (ux*vy + ux*wz + vy*wz - uy*vx - uz*wx - vz*wy)*
                                   get_Field_LAP(Ajac, xl, yl, zl)*
                                   get_Field_LAP(Ajac, xl, yl, zl);
}
// Compute Q on the device, dump it, and terminate the program.
//
// Output:
//   * "Q.dat"            - collective MPI file written through write_3d1().
//   * "Q<px><py><pz>.dat" - one text file per rank with columns x, y, z, Q
//                          for the cells in [LAP, n+LAP) on every axis.
//
// The function always ends with exit(0).
//
// Changes relative to the previous version:
//   * Q_d is allocated with the n?_2lap extents. The kernel writes it
//     through get_Field_LAP at (x+LAP, y+LAP, z+LAP) and memcpy_All copies
//     nx_2lap x ny_2lap x nz_2lap elements out of it, so an nx x ny x nz
//     allocation was overrun by both.
//   * The per-rank text file is only written when fopen succeeds, and it is
//     closed explicitly.
//   * The file name is built with a bounded snprintf.
void get_Q(){
    // Derivative work fields: no LAP layers, addressed with get_Field.
    cudaField *ui; new_cudaField(&ui, nx, ny, nz);
    cudaField *us; new_cudaField(&us, nx, ny, nz);
    cudaField *uk; new_cudaField(&uk, nx, ny, nz);
    cudaField *vi; new_cudaField(&vi, nx, ny, nz);
    cudaField *vs; new_cudaField(&vs, nx, ny, nz);
    cudaField *vk; new_cudaField(&vk, nx, ny, nz);
    cudaField *wi; new_cudaField(&wi, nx, ny, nz);
    cudaField *ws; new_cudaField(&ws, nx, ny, nz);
    cudaField *wk; new_cudaField(&wk, nx, ny, nz);

    // Result field: LAP-offset layout, see the header comment.
    cudaField *Q_d; new_cudaField(&Q_d, nx_2lap, ny_2lap, nz_2lap);

    cudaJobPackage job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap, nz_lap) );

    OCFD_dx0(*pu_d, *uk, job, BlockDim_X, &Stream[0], D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pv_d, *vk, job, BlockDim_X, &Stream[0], D0_bound[0], D0_bound[1]);
    OCFD_dx0(*pw_d, *wk, job, BlockDim_X, &Stream[0], D0_bound[0], D0_bound[1]);

    OCFD_dy0(*pu_d, *ui, job, BlockDim_Y, &Stream[0], D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pv_d, *vi, job, BlockDim_Y, &Stream[0], D0_bound[2], D0_bound[3]);
    OCFD_dy0(*pw_d, *wi, job, BlockDim_Y, &Stream[0], D0_bound[2], D0_bound[3]);

    OCFD_dz0(*pu_d, *us, job, BlockDim_Z, &Stream[0], D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pv_d, *vs, job, BlockDim_Z, &Stream[0], D0_bound[4], D0_bound[5]);
    OCFD_dz0(*pw_d, *ws, job, BlockDim_Z, &Stream[0], D0_bound[4], D0_bound[5]);

    dim3 griddim , blockdim;
    cal_grid_block_dim(&griddim, &blockdim, BlockDimX, BlockDimY, BlockDimZ, nx, ny, nz);

    // The Q kernel indexes from 0 and adds LAP itself.
    job.setup( dim3(0,0,0) , dim3(nx,ny,nz) );

    CUDA_LAUNCH(( get_Q_kernal<<<griddim, blockdim>>>(*ui,*us,*uk,*vi,*vs,*vk,*wi,*ws,*wk,*pAkx_d,
                  *pAky_d,*pAkz_d,*pAix_d,*pAiy_d,*pAiz_d,*pAsx_d,*pAsy_d,*pAsz_d,*pAjac_d,*Q_d,job) ));

    // pP is reused as the host staging buffer for Q.
    memcpy_All(pP, Q_d->ptr, Q_d->pitch, D2H, nx_2lap, ny_2lap, nz_2lap);

    MPI_File tmp_file;
    MPI_File_open(MPI_COMM_WORLD, "Q.dat", MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &tmp_file);
    write_3d1(tmp_file, 0, pP);
    MPI_File_close(&tmp_file);

    char filename[100];
    snprintf(filename, sizeof(filename), "Q%02d%02d%02d.dat", npx, npy, npz);

    FILE *fp = fopen(filename, "w");
    if(fp == NULL){
        printf("\033[31mCan not open %s for writing , per-rank Q output skipped\033[0m\n", filename);
    }else{
        fprintf(fp, "variables=x,y,z,Q\n");
        fprintf(fp, "zone i=%d ,j=%d ,k=%d\n", nx, ny, nz);

        for(int k = LAP; k < nz+LAP; k++){
            for(int j = LAP; j < ny+LAP; j++){
                for(int i = LAP; i < nx+LAP; i++){
                    // Linear index into the host arrays (x fastest).
                    const size_t idx = (size_t)i + (size_t)nx_2lap*((size_t)j + (size_t)ny_2lap*(size_t)k);
                    fprintf(fp, "%15.6f%15.6f%15.6f%15.6f\n", *(pAxx+idx), *(pAyy+idx), *(pAzz+idx), *(pP+idx));
                }
            }
        }
        fclose(fp);
    }

    delete_cudaField(ui);
    delete_cudaField(us);
    delete_cudaField(uk);
    delete_cudaField(vi);
    delete_cudaField(vs);
    delete_cudaField(vk);
    delete_cudaField(wi);
    delete_cudaField(ws);
    delete_cudaField(wk);
    delete_cudaField(Q_d);

    exit(0);
}
// Append XY-plane slabs of d, u, v, w and T to "Savedata-XY<iii>.dat".
//
// ANA_npara[ID][0] is the number of slabs, ANA_npara[ID][1] the number of
// consecutive planes per slab, and ANA_npara[ID][i+2] the first plane index
// of slab i. For each slab, rank 0 opens the file in append mode and writes a
// header (byte count, Istep, tt, byte count); then every rank calls
// write_2d_XYa for each plane and variable.
//
// Changes relative to the previous version:
//   * fp starts out as NULL, so ranks other than 0 no longer hand an
//     uninitialised pointer to write_2d_XYa.
//   * A failed fopen is reported and aborts the run instead of passing NULL
//     to fwrite.
//   * The file name is built with a bounded snprintf.
void ana_saveplaneXY(int ID){
    const int point = ANA_npara[ID][0];
    const int bandwidth = ANA_npara[ID][1];

    FILE *fp = NULL;
    char fp_name[120];

    memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    for(int i = 0; i < point; i++){
        if(my_id == 0){
            printf("Save data ...., %d, %lf, %d\n", Istep, tt, i);

            snprintf(fp_name, sizeof(fp_name), "Savedata-XY%03d.dat", i);
            fp = fopen(fp_name, "a");
            if(fp == NULL){
                printf("\033[31mCan not open %s for appending , program Abort\033[0m\n", fp_name);
                MPI_Abort(MPI_COMM_WORLD , 1);
            }

            // Header: length marker, step, time, length marker.
            int bytes = sizeof(REAL) + sizeof(int);
            fwrite(&bytes, sizeof(int), 1, fp);
            fwrite(&Istep, sizeof(int), 1, fp);
            fwrite(&tt, sizeof(REAL), 1, fp);
            fwrite(&bytes, sizeof(int), 1, fp);
        }

        for(int j = ANA_npara[ID][i+2]; j <= ANA_npara[ID][i+2]+bandwidth-1; j++){
            write_2d_XYa(fp, j, pd);
            write_2d_XYa(fp, j, pu);
            write_2d_XYa(fp, j, pv);
            write_2d_XYa(fp, j, pw);
            write_2d_XYa(fp, j, pT);
        }

        if(my_id == 0){
            fclose(fp);
            fp = NULL;
        }
    }
}
// Append YZ-plane slabs of d, u, v, w and T to "Savedata-YZ<iii>.dat".
//
// ANA_npara[ID][0] is the number of slabs, ANA_npara[ID][1] the number of
// consecutive planes per slab, and ANA_npara[ID][i+2] the first plane index
// of slab i. For each slab, rank 0 opens the file in append mode and writes a
// header (byte count, Istep, tt, byte count); then every rank calls
// write_2d_YZa for each plane and variable.
//
// Changes relative to the previous version:
//   * fp starts out as NULL, so ranks other than 0 no longer hand an
//     uninitialised pointer to write_2d_YZa.
//   * A failed fopen is reported and aborts the run instead of passing NULL
//     to fwrite.
//   * The file name is built with a bounded snprintf.
void ana_saveplaneYZ(int ID){
    const int point = ANA_npara[ID][0];
    const int bandwidth = ANA_npara[ID][1];

    FILE *fp = NULL;
    char fp_name[120];

    memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    for(int i = 0; i < point; i++){
        if(my_id == 0){
            printf("Save data ...., %d, %lf, %d\n", Istep, tt, i);

            snprintf(fp_name, sizeof(fp_name), "Savedata-YZ%03d.dat", i);
            fp = fopen(fp_name, "a");
            if(fp == NULL){
                printf("\033[31mCan not open %s for appending , program Abort\033[0m\n", fp_name);
                MPI_Abort(MPI_COMM_WORLD , 1);
            }

            // Header: length marker, step, time, length marker.
            int bytes = sizeof(REAL) + sizeof(int);
            fwrite(&bytes, sizeof(int), 1, fp);
            fwrite(&Istep, sizeof(int), 1, fp);
            fwrite(&tt, sizeof(REAL), 1, fp);
            fwrite(&bytes, sizeof(int), 1, fp);
        }

        for(int j = ANA_npara[ID][i+2]; j <= ANA_npara[ID][i+2]+bandwidth-1; j++){
            write_2d_YZa(fp, j, pd);
            write_2d_YZa(fp, j, pu);
            write_2d_YZa(fp, j, pv);
            write_2d_YZa(fp, j, pw);
            write_2d_YZa(fp, j, pT);
        }

        if(my_id == 0){
            fclose(fp);
            fp = NULL;
        }
    }
}
// Append XZ-plane slabs of d, u, v, w and T to "Savedata-XZ<iii>.dat".
//
// ANA_npara[ID][0] is the number of slabs, ANA_npara[ID][1] the number of
// consecutive planes per slab, and ANA_npara[ID][i+2] the first plane index
// of slab i. For each slab, rank 0 opens the file in append mode and writes a
// header (byte count, Istep, tt, byte count); then every rank calls
// write_2d_XZa for each plane and variable.
//
// Changes relative to the previous version:
//   * fp starts out as NULL, so ranks other than 0 no longer hand an
//     uninitialised pointer to write_2d_XZa.
//   * A failed fopen is reported and aborts the run instead of passing NULL
//     to fwrite.
//   * The file name is built with a bounded snprintf.
void ana_saveplaneXZ(int ID){
    const int point = ANA_npara[ID][0];
    const int bandwidth = ANA_npara[ID][1];

    FILE *fp = NULL;
    char fp_name[120];

    memcpy_All(pd , pd_d->ptr , pd_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pu , pu_d->ptr , pu_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pv , pv_d->ptr , pv_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pw , pw_d->ptr , pw_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);
    memcpy_All(pT , pT_d->ptr , pT_d->pitch , D2H , nx_2lap , ny_2lap , nz_2lap);

    for(int i = 0; i < point; i++){
        if(my_id == 0){
            printf("Save data ...., %d, %lf, %d\n", Istep, tt, i);

            snprintf(fp_name, sizeof(fp_name), "Savedata-XZ%03d.dat", i);
            fp = fopen(fp_name, "a");
            if(fp == NULL){
                printf("\033[31mCan not open %s for appending , program Abort\033[0m\n", fp_name);
                MPI_Abort(MPI_COMM_WORLD , 1);
            }

            // Header: length marker, step, time, length marker.
            int bytes = sizeof(REAL) + sizeof(int);
            fwrite(&bytes, sizeof(int), 1, fp);
            fwrite(&Istep, sizeof(int), 1, fp);
            fwrite(&tt, sizeof(REAL), 1, fp);
            fwrite(&bytes, sizeof(int), 1, fp);
        }

        for(int j = ANA_npara[ID][i+2]; j <= ANA_npara[ID][i+2]+bandwidth-1; j++){
            write_2d_XZa(fp, j, pd);
            write_2d_XZa(fp, j, pu);
            write_2d_XZa(fp, j, pv);
            write_2d_XZa(fp, j, pw);
            write_2d_XZa(fp, j, pT);
        }

        if(my_id == 0){
            fclose(fp);
            fp = NULL;
        }
    }
}
// Dispatch one analysis task.
//
//   style  action
//   100    ana_NAN_and_NT()
//   101    ana_time_average()
//   102    HybridAuto_scheme_IO()
//   103    get_Q()
//   104    ana_saveplaneXY(ID)
//   105    ana_saveplaneYZ(ID)
//   106    ana_saveplaneXZ(ID)
//   107    HybridAuto_scheme_Proportion(), only when IFLAG_HybridAuto == 1
//
// Any other style value is ignored. ID is forwarded only to the plane savers.
void OCFD_ana(int style, int ID){
    if(style == 100){
        ana_NAN_and_NT();
    }else if(style == 101){
        ana_time_average();
    }else if(style == 102){
        HybridAuto_scheme_IO();
    }else if(style == 103){
        get_Q();
    }else if(style == 104){
        ana_saveplaneXY(ID);
    }else if(style == 105){
        ana_saveplaneYZ(ID);
    }else if(style == 106){
        ana_saveplaneXZ(ID);
    }else if(style == 107){
        if(IFLAG_HybridAuto == 1) HybridAuto_scheme_Proportion();
    }
}
#ifdef __cplusplus
}
#endif
// boundary scheme
#include "parameters.h"
#include "utility.h"
#include "OCFD_Schemes.h"
#include "parameters_d.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#include "mpi.h"
// PREPARE_x / PREPARE_y / PREPARE_z
//
// Declare `blockdim` and `griddim` in the current scope and fill them with
// cal_grid_block_dim() for a launch that is one cell thick along the named
// axis and spans `size` along the other two. They expect `blockdim_in` and
// `size` to be visible at the point of expansion.
//
// The last line of each macro must NOT end with a backslash: a trailing
// continuation splices whatever line follows (the next #define / #ifdef)
// into the macro body.
#define PREPARE_x \
dim3 blockdim , griddim;\
cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , 1 , size.y , size.z);
#define PREPARE_y \
dim3 blockdim , griddim;\
cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , size.x , 1 , size.z);
#define PREPARE_z \
dim3 blockdim , griddim;\
cal_grid_block_dim(&griddim , &blockdim , blockdim_in.x , blockdim_in.y , blockdim_in.z , size.x , size.y , 1);
#ifdef DEBUG_MODE
// CHECK_SIZE(dir , call), debug build.
// `dir` is one of x / y / z and is pasted onto PREPARE_ to select the
// matching PREPARE_x / PREPARE_y / PREPARE_z. When size.dir >= LAP the launch
// dimensions are prepared and `call` is executed; otherwise the job start and
// size are printed and the run is aborted through MPI_Abort.
// Expects `size` and `job_in` to be visible at the point of expansion.
#define CHECK_SIZE(dir , call)\
if(size.dir >= LAP){\
PREPARE_ ##dir\
call;\
}else{\
printf("job_in.start." #dir " = %d , job_in.size." #dir " = %d\n",job_in.start.dir , size.dir);\
printf("illegal size , to launch %s , size." #dir " >= LAP (%d) is required\n" , __FUNCTION__ ,LAP);\
MPI_Abort(MPI_COMM_WORLD , 1);\
}
#else
// CHECK_SIZE(dir , call), release build: no size check, just prepare the
// launch dimensions for direction `dir` and execute `call`.
#define CHECK_SIZE(dir , call) \
PREPARE_ ##dir\
call;
#endif
// CHECK_X(callm , callp)
// Declares `dim3 size` and fills it with jobsize(&job_in , &size), then:
//   - runs `callm` (through CHECK_SIZE) when this rank is first along x
//     (npx == 0) and the job starts at x == LAP;
//   - runs `callp` (through CHECK_SIZE) when this rank is last along x
//     (npx == NPX0-1) and the job ends at x == nx_lap.
// Both conditions are tested independently, so both calls may run.
// Expects `job_in` and `blockdim_in` to be visible at the point of expansion.
#define CHECK_X(callm , callp)\
dim3 size;\
jobsize(&job_in , &size);\
if(npx == 0 && job_in.start.x == LAP){\
CHECK_SIZE(x , callm)\
}\
if(npx == NPX0-1 && (job_in.start.x + size.x == nx_lap) ){\
CHECK_SIZE(x , callp)\
}
// CHECK_Y(callm , callp)
// Declares `dim3 size` and fills it with jobsize(&job_in , &size), then:
//   - runs `callm` (through CHECK_SIZE) when this rank is first along y
//     (npy == 0) and the job starts at y == LAP;
//   - runs `callp` (through CHECK_SIZE) when this rank is last along y
//     (npy == NPY0-1) and the job ends at y == ny_lap.
// Both conditions are tested independently, so both calls may run.
// Expects `job_in` and `blockdim_in` to be visible at the point of expansion.
#define CHECK_Y(callm , callp)\
dim3 size;\
jobsize(&job_in , &size);\
if(npy == 0 && job_in.start.y == LAP){\
CHECK_SIZE(y , callm)\
}\
if(npy == NPY0-1 && (job_in.start.y + size.y == ny_lap) ){\
CHECK_SIZE(y , callp)\
}
// CHECK_Z(callm , callp)
// Declares `dim3 size` and fills it with jobsize(&job_in , &size), then:
//   - runs `callm` (through CHECK_SIZE) when this rank is first along z
//     (npz == 0) and the job starts at z == LAP;
//   - runs `callp` (through CHECK_SIZE) when this rank is last along z
//     (npz == NPZ0-1) and the job ends at z == nz_lap.
// Both conditions are tested independently, so both calls may run.
// Expects `job_in` and `blockdim_in` to be visible at the point of expansion.
#define CHECK_Z(callm , callp)\
dim3 size;\
jobsize(&job_in , &size);\
if(npz == 0 && job_in.start.z == LAP){\
CHECK_SIZE(z , callm)\
}\
if(npz == NPZ0-1 && (job_in.start.z + size.z == nz_lap) ){\
CHECK_SIZE(z , callp)\
}
#ifdef __cplusplus
extern "C"{
#endif
// =========================================================================================================== //
// Reduced-order difference for the first four cells next to a boundary.
//
// flagxyzb.y selects the boundary: 1/2/3 = low side of x/y/z, 4/5/6 = high
// side of x/y/z. The cell's distance from that boundary (0..3) is taken from
// the matching component of `coords`: the component itself on the low side,
// (job.end - job.start - 1 - component) on the high side.
//
// With c = -ka1 as the index of the current cell inside `stencil`:
//   distance 0 : one-sided 2-point difference (forward on the low side,
//                backward on the high side)
//   distance 1 : (s[c+1] - s[c-1]) * 0.5
//   distance 2 : (s[c-2] - 8 s[c-1] + 8 s[c+1] - s[c+2]) / 12
//   distance 3 : (s[c+3] - s[c-3] - 9 (s[c+2] - s[c-2]) + 45 (s[c+1] - s[c-1])) / 60
// The result is NOT divided by the grid spacing here.
//
// Returns 0 when *tmp was written, 1 when the cell is not one of those four
// or flagxyzb.y is not in 1..6 (in which case *tmp is left untouched).
__device__ int OCFD_D0bound_scheme_kernel(REAL* tmp, dim3 flagxyzb, dim3 coords, REAL *stencil, int ka1, cudaJobPackage job){
    const unsigned int side = flagxyzb.y;
    if(side < 1 || side > 6) return 1;

    const int high = (side > 3);

    // Pick the coordinate and the job extent along the selected axis.
    unsigned int pos;
    unsigned int extent;
    switch(high ? side - 3 : side){
    case 1:
        pos = coords.x;
        extent = job.end.x - job.start.x;
        break;
    case 2:
        pos = coords.y;
        extent = job.end.y - job.start.y;
        break;
    default:
        pos = coords.z;
        extent = job.end.z - job.start.z;
        break;
    }

    // Distance from the boundary. Unsigned wrap-around keeps cells that are
    // not within the last four from matching any case below.
    const unsigned int depth = high ? (extent - 1 - pos) : pos;

    const int c = -ka1;

    switch(depth){
    case 0:
        if(high){
            *tmp = (stencil[c] - stencil[c-1]);
        }else{
            *tmp = (stencil[c+1] - stencil[c]);
        }
        return 0;
    case 1:
        *tmp = (stencil[c+1] - stencil[c-1])*0.5;
        return 0;
    case 2:
        *tmp = (stencil[c-2] - 8.0*stencil[c-1] + 8.0*stencil[c+1] - stencil[c+2])/12.0;
        return 0;
    case 3:
        *tmp = (stencil[c+3] - stencil[c-3]
                -9.0*(stencil[c+2] - stencil[c-2])
                +45.0*(stencil[c+1] - stencil[c-1]))/60.0;
        return 0;
    }

    return 1;
}
// Low-x boundary closure of the x-difference.
//
// f is read through get_Field_LAP (indices include the LAP offset), fx is
// written through get_Field (indices without it). For every (y, z) inside the
// job, the first four x-cells of fx are filled with:
//   cell 0 : 2-point forward difference
//   cell 1 : 3-point central difference
//   cell 2 : 5-point central difference
//   cell 3 : 7-point central difference
// each divided by hx_d. The thread's x index is not used.
__global__ void OCFD_Dx0_bound_kernel_m(cudaField f , cudaField fx , cudaJobPackage job){
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(y >= job.end.y || z >= job.end.z) return;

    // Output indices (fx carries no LAP offset).
    const unsigned int yo = y - LAP;
    const unsigned int zo = z - LAP;

    get_Field(fx , 0 , yo , zo) =
        ( get_Field_LAP(f , LAP+1 , y , z) - get_Field_LAP(f , LAP , y , z))/hx_d;

    get_Field(fx , 1 , yo , zo) =
        ( get_Field_LAP(f , LAP+2 , y , z) - get_Field_LAP(f , LAP , y , z))*0.5/hx_d;

    get_Field(fx , 2 , yo , zo) =
        ( get_Field_LAP(f , LAP , y , z) - 8.0*get_Field_LAP(f , LAP+1 , y , z)
          + 8.0*get_Field_LAP(f , LAP+3 , y , z) - get_Field_LAP(f , LAP+4 , y , z))/(12.0*hx_d);

    get_Field(fx , 3 , yo , zo) =
        ( get_Field_LAP(f , LAP+6 , y , z) - get_Field_LAP(f , LAP , y , z)
          -9.0*(get_Field_LAP(f , LAP+5 , y , z) - get_Field_LAP(f , LAP+1 , y , z) )
          +45.0*(get_Field_LAP(f , LAP+4 , y , z) - get_Field_LAP(f , LAP+2 , y , z)) )/(60.0*hx_d);
}
// High-x boundary closure of the x-difference.
//
// f is read through get_Field_LAP, fx is written through get_Field. For every
// (y, z) inside the job, the last four x-cells of fx (nx_d-1 .. nx_d-4) are
// filled with a 2-point backward, 3-point, 5-point and 7-point central
// difference respectively, each divided by hx_d. `last` is the LAP-offset
// index of the final cell, nx_d+LAP-1. The thread's x index is not used.
__global__ void OCFD_Dx0_bound_kernel_p(cudaField f , cudaField fx , cudaJobPackage job){
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(y >= job.end.y || z >= job.end.z) return;

    const unsigned int yo = y - LAP;
    const unsigned int zo = z - LAP;
    const unsigned int last = nx_d+LAP-1;

    get_Field(fx , nx_d - 1 , yo , zo) =
        ( get_Field_LAP(f , last , y , z) - get_Field_LAP(f , last-1 , y , z))/hx_d;

    get_Field(fx , nx_d - 2 , yo , zo) =
        ( get_Field_LAP(f , last , y , z) - get_Field_LAP(f , last-2 , y , z))*0.5/hx_d;

    get_Field(fx , nx_d - 3 , yo , zo) =
        ( get_Field_LAP(f , last-4 , y , z) - 8.0*get_Field_LAP(f , last-3 , y , z)
          + 8.0*get_Field_LAP(f , last-1 , y , z) - get_Field_LAP(f , last , y , z))/(12.0*hx_d);

    get_Field(fx , nx_d - 4 , yo , zo) =
        ( get_Field_LAP(f , last , y , z) - get_Field_LAP(f , last-6 , y , z)
          -9.0*( get_Field_LAP(f , last - 1 , y , z) - get_Field_LAP(f , last - 5 , y , z) )
          +45.0*( get_Field_LAP(f , last - 2 , y , z) - get_Field_LAP(f , last - 4 , y , z)) )/(60.0*hx_d);
}
// =========================================================================================================== //
// Low-y boundary closure of the y-difference.
//
// f is read through get_Field_LAP, fx is written through get_Field. For every
// (x, z) inside the job, the first four y-cells of fx are filled with a
// 2-point forward, 3-point, 5-point and 7-point central difference
// respectively, each divided by hy_d. The thread's y index is not used.
__global__ void OCFD_Dy0_bound_kernel_m(cudaField f , cudaField fx , cudaJobPackage job){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(z >= job.end.z || x >= job.end.x) return;

    const unsigned int xo = x - LAP;
    const unsigned int zo = z - LAP;

    get_Field(fx , xo, 0, zo) =
        ( get_Field_LAP(f , x, LAP+1, z) - get_Field_LAP(f ,x, LAP, z))/hy_d;

    get_Field(fx , xo, 1, zo) =
        ( get_Field_LAP(f , x, LAP+2, z) - get_Field_LAP(f ,x, LAP, z))*0.5/hy_d;

    get_Field(fx , xo, 2, zo) =
        ( get_Field_LAP(f , x, LAP, z) - 8.0*get_Field_LAP(f, x, LAP+1, z)
          + 8.0*get_Field_LAP(f , x, LAP+3, z) - get_Field_LAP(f, x, LAP+4, z))/(12.0*hy_d);

    get_Field(fx , xo, 3, zo) =
        ( get_Field_LAP(f , x, LAP+6, z) - get_Field_LAP(f , x, LAP, z)
          -9.0*(get_Field_LAP(f , x, LAP+5, z) - get_Field_LAP(f , x, LAP+1, z) )
          +45.0*(get_Field_LAP(f , x, LAP+4, z) - get_Field_LAP(f , x, LAP+2, z)) )/(60.0*hy_d);
}
// High-y boundary closure of the y-difference.
//
// f is read through get_Field_LAP, fx is written through get_Field. For every
// (x, z) inside the job, the last four y-cells of fx (ny_d-1 .. ny_d-4) are
// filled with a 2-point backward, 3-point, 5-point and 7-point central
// difference respectively, each divided by hy_d. The formulas are written as
// the negated mirror image of the low-side ones. `last` is the LAP-offset
// index of the final cell, ny_d+LAP-1. The thread's y index is not used.
__global__ void OCFD_Dy0_bound_kernel_p(cudaField f , cudaField fx , cudaJobPackage job){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    if(z >= job.end.z || x >= job.end.x) return;

    const unsigned int xo = x - LAP;
    const unsigned int zo = z - LAP;
    const unsigned int last = ny_d+LAP-1;

    get_Field(fx, xo, ny_d-1, zo) =
        -( get_Field_LAP(f , x, last-1, z) - get_Field_LAP(f ,x, last, z))/hy_d;

    get_Field(fx, xo, ny_d-2, zo) =
        -( get_Field_LAP(f , x, last-2, z) - get_Field_LAP(f ,x, last, z))*0.5/hy_d;

    get_Field(fx, xo, ny_d-3, zo) =
        -( get_Field_LAP(f , x, last, z) - 8.0*get_Field_LAP(f, x, last-1, z)
           + 8.0*get_Field_LAP(f , x, last-3, z) - get_Field_LAP(f, x, last-4, z))/(12.0*hy_d);

    get_Field(fx, xo, ny_d-4, zo) =
        -( get_Field_LAP(f , x, last-6, z) - get_Field_LAP(f , x, last, z)
           -9.0*(get_Field_LAP(f , x, last-5, z) - get_Field_LAP(f , x, last-1, z) )
           +45.0*(get_Field_LAP(f , x, last-4, z) - get_Field_LAP(f , x, last-2, z)) )/(60.0*hy_d);
}
// =========================================================================================================== //
// Low-z boundary closure of the z-difference.
//
// f is read through get_Field_LAP, fx is written through get_Field. For every
// (x, y) inside the job, the first four z-cells of fx are filled with a
// 2-point forward, 3-point, 5-point and 7-point central difference
// respectively, each divided by hz_d. The thread's z index is not used.
__global__ void OCFD_Dz0_bound_kernel_m(cudaField f , cudaField fx , cudaJobPackage job){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    if(y >= job.end.y || x >= job.end.x) return;

    const unsigned int xo = x - LAP;
    const unsigned int yo = y - LAP;

    get_Field(fx , xo, yo, 0) =
        ( get_Field_LAP(f , x,y, LAP+1) - get_Field_LAP(f ,x,y, LAP))/hz_d;

    get_Field(fx , xo, yo, 1) =
        ( get_Field_LAP(f , x,y, LAP+2) - get_Field_LAP(f ,x,y, LAP))*0.5/hz_d;

    get_Field(fx , xo, yo, 2) =
        ( get_Field_LAP(f , x, y , LAP ) - 8.0*get_Field_LAP(f , x,y, LAP+1)
          + 8.0*get_Field_LAP(f , x, y , LAP+3) - get_Field_LAP(f , x,y, LAP+4))/(12.0*hz_d);

    get_Field(fx , xo, yo, 3) =
        ( get_Field_LAP(f , x,y, LAP+6) - get_Field_LAP(f , x,y, LAP)
          -9.0*(get_Field_LAP(f , x,y, LAP+5) - get_Field_LAP(f , x,y, LAP+1) )
          +45.0*(get_Field_LAP(f , x,y, LAP+4) - get_Field_LAP(f , x,y, LAP+2)) )/(60.0*hz_d);
}
// High-z boundary closure of the z-difference.
//
// f is read through get_Field_LAP, fx is written through get_Field. For every
// (x, y) inside the job, the last four z-cells of fx (nz_d-1 .. nz_d-4) are
// filled with a 2-point backward, 3-point, 5-point and 7-point central
// difference respectively, each divided by hz_d. `last` is the LAP-offset
// index of the final cell, nz_d+LAP-1. The thread's z index is not used.
__global__ void OCFD_Dz0_bound_kernel_p(cudaField f , cudaField fx , cudaJobPackage job){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    if(y >= job.end.y || x >= job.end.x) return;

    const unsigned int xo = x - LAP;
    const unsigned int yo = y - LAP;
    const unsigned int last = nz_d+LAP-1;

    get_Field(fx , xo, yo, nz_d - 1) =
        ( get_Field_LAP(f , x,y, last) - get_Field_LAP(f , x,y, last-1))/hz_d;

    get_Field(fx , xo, yo, nz_d - 2) =
        ( get_Field_LAP(f , x,y, last) - get_Field_LAP(f , x,y, last-2))*0.5/hz_d;

    get_Field(fx , xo, yo, nz_d - 3) =
        ( get_Field_LAP(f , x,y, last-4) - 8.0*get_Field_LAP(f , x,y, last-3)
          + 8.0*get_Field_LAP(f , x,y, last-1) - get_Field_LAP(f , x,y, last ))/(12.0*hz_d);

    get_Field(fx , xo, yo, nz_d - 4) =
        ( get_Field_LAP(f , x,y, last ) - get_Field_LAP(f , x,y, last - 6)
          -9.0*(get_Field_LAP(f , x,y, last-1) - get_Field_LAP(f , x,y, last - 5) )
          +45.0*(get_Field_LAP(f , x,y, last-2) - get_Field_LAP(f , x,y, last - 4)) )/(60.0*hz_d);
}
// Host launcher for the x-direction boundary difference kernels.
//
// CHECK_X declares `size`, `blockdim` and `griddim` and decides which of the
// two launches below run: the first block (OCFD_Dx0_bound_kernel_m) when this
// rank is first along x and the job starts at x == LAP, the second block
// (OCFD_Dx0_bound_kernel_p) when this rank is last along x and the job
// reaches nx_lap. Both kernels are launched on *stream with no dynamic
// shared memory.
//
//   f           input field
//   fx          output field
//   job_in      index range to process
//   blockdim_in requested block shape, forwarded to cal_grid_block_dim
//   stream      stream the kernels are launched on
void OCFD_Dx0_bound(cudaField f , cudaField fx , cudaJobPackage job_in , dim3 blockdim_in, cudaStream_t *stream){
CHECK_X(
{
CUDA_LAUNCH(( OCFD_Dx0_bound_kernel_m<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) ));
},{
CUDA_LAUNCH(( OCFD_Dx0_bound_kernel_p<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) ));
}
)
}
// Host launcher for the y-direction boundary difference kernels.
//
// CHECK_Y declares `size`, `blockdim` and `griddim` and decides which of the
// two launches below run: the first block (OCFD_Dy0_bound_kernel_m) when this
// rank is first along y and the job starts at y == LAP, the second block
// (OCFD_Dy0_bound_kernel_p) when this rank is last along y and the job
// reaches ny_lap. Both kernels are launched on *stream with no dynamic
// shared memory.
//
//   f           input field
//   fx          output field
//   job_in      index range to process
//   blockdim_in requested block shape, forwarded to cal_grid_block_dim
//   stream      stream the kernels are launched on
void OCFD_Dy0_bound(cudaField f , cudaField fx , cudaJobPackage job_in , dim3 blockdim_in, cudaStream_t *stream){
CHECK_Y(
{
CUDA_LAUNCH(( OCFD_Dy0_bound_kernel_m<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) ));
},
{
CUDA_LAUNCH(( OCFD_Dy0_bound_kernel_p<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) ));
}
)
}
// Host launcher for the z-direction boundary difference kernels.
//
// CHECK_Z declares `size`, `blockdim` and `griddim` and decides which of the
// two launches below run: the first block (OCFD_Dz0_bound_kernel_m) when this
// rank is first along z and the job starts at z == LAP, the second block
// (OCFD_Dz0_bound_kernel_p) when this rank is last along z and the job
// reaches nz_lap. Both kernels are launched on *stream with no dynamic
// shared memory.
//
//   f           input field
//   fx          output field
//   job_in      index range to process
//   blockdim_in requested block shape, forwarded to cal_grid_block_dim
//   stream      stream the kernels are launched on
void OCFD_Dz0_bound(cudaField f , cudaField fx , cudaJobPackage job_in , dim3 blockdim_in, cudaStream_t *stream){
CHECK_Z(
{
CUDA_LAUNCH(( OCFD_Dz0_bound_kernel_m<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) ));
},
{
CUDA_LAUNCH(( OCFD_Dz0_bound_kernel_p<<<griddim , blockdim, 0, *stream>>>(f, fx, job_in) ));
}
)
}
// Decide whether a boundary scheme applies to this job and record it in
// flagxyzb->y.
//
// flagxyzb->x names the sweep axis: 1 or 4 -> x, 2 or 5 -> y, 3 or 6 -> z;
// any other value leaves flagxyzb->y untouched. For the selected axis:
//   - rank first on the axis, job starting at LAP, boundp == 1
//         -> flagxyzb->y = 1 / 2 / 3
//   - rank last on the axis, job ending at n?_lap, boundm == 1
//         -> flagxyzb->y = 4 / 5 / 6
// When both hold, the second assignment wins.
//
// NOTE(review): boundp gates the low side and boundm the high side, which is
// the reverse of the _m / _p kernel naming used elsewhere - confirm intent.
void OCFD_bound(dim3 *flagxyzb, int boundp, int boundm, cudaJobPackage job){
    // Kept from the original; the computed extent is not used below.
    dim3 extent;
    jobsize(&job, &extent);

    const unsigned int axis = flagxyzb->x;
    const int low_enabled  = (boundp == 1);
    const int high_enabled = (boundm == 1);

    if(axis == 1 || axis == 4){
        if(npx == 0 && job.start.x == LAP && low_enabled) flagxyzb->y = 1;
        if(npx == NPX0-1 && job.end.x == nx_lap && high_enabled) flagxyzb->y = 4;
    }else if(axis == 2 || axis == 5){
        if(npy == 0 && job.start.y == LAP && low_enabled) flagxyzb->y = 2;
        if(npy == NPY0-1 && job.end.y == ny_lap && high_enabled) flagxyzb->y = 5;
    }else if(axis == 3 || axis == 6){
        if(npz == 0 && job.start.z == LAP && low_enabled) flagxyzb->y = 3;
        if(npz == NPZ0-1 && job.end.z == nz_lap && high_enabled) flagxyzb->y = 6;
    }
}
/*__device__ int OCFD_bound_scheme_kernel_p(int flag, dim3 flagxyzb, dim3 coords, cudaSoA du, int num, cudaField fx, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
unsigned int offset_out = job.start.x + fx.pitch*(job.start.y + ny_d*job.start.z);
if(flag != 0){
switch(flagxyzb.x){
case 4:
if(threadIdx.x != 0) get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp_r - tmp_l)/hx_d;
break;
case 5:
if(threadIdx.x != 0) get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp_r - tmp_l)/hy_d;
break;
case 6:
if(threadIdx.x != 0) get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp_r - tmp_l)/hz_d;
break;
}
switch(flagxyzb.y){
case 4:
{
if(coords.x == job.end.x-job.start.x-1){
REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hx_d;
return 0;
}else if(coords.x >= job.end.x-job.start.x-kb1){
REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hx_d;
return 0;
}
}
break;
case 5:
{
if(coords.y == job.end.y-job.start.y-1){
REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hy_d;
return 0;
}else if(coords.y >= job.end.y-job.start.y-kb1){
REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hy_d;
return 0;
}
}
break;
case 6:
{
if(coords.z == job.end.z-job.start.z-1){
REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hz_d;
return 0;
}else if(coords.z >= job.end.z-job.start.z-kb1){
REAL tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1+1] - stencil[-ka1], stencil[-ka1] - stencil[-ka1-1]);
REAL tmp2 = stencil[-ka1-1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1-1] - stencil[-ka1-2]);
get_Field(fx, coords.x-LAP, coords.y-LAP, coords.z-LAP, offset_out) = (tmp - tmp2)/hz_d;
return 0;
}
}
break;
}
}
return flag;
}*/
// Single-candidate WENO5-style reconstruction using stencil[0..2].
//
// S2 is the smoothness measure 13*(s0 - 2 s1 + s2)^2 + 3*(s0 - 4 s1 + 3 s2)^2
// and a2 = 1 / (12*ep + S2)^2 the resulting weight. With only one candidate
// the weight appears in both numerator and denominator, so the value is
// (2 s0 - 7 s1 + 11 s2) / 6 up to rounding; the weighted form is kept as is.
__device__ REAL OCFD_weno5_kernel_P_right(REAL *stencil){
    const REAL ep = 1e-6;

    const REAL curv  = stencil[0] - 2.0*stencil[1] + stencil[2];
    const REAL slope = stencil[0] - 4.0*stencil[1] + 3.0*stencil[2];

    REAL S2 = 13*curv*curv;
    S2 += 3*slope*slope;

    const REAL a2  = 1.0/((12.0*ep + S2)*(12.0*ep + S2));
    const REAL q23 = (2.0*stencil[0] - 7.0*stencil[1] + 11.0*stencil[2]);

    const REAL value = a2*q23/(6.0*a2);
    return value;
}
// Single-candidate WENO5-style reconstruction using stencil[2..4].
//
// S0 is the smoothness measure 13*(s2 - 2 s3 + s4)^2 + 3*(3 s2 - 4 s3 + s4)^2
// and a0 = 1 / (12*ep + S0)^2 the resulting weight. With only one candidate
// the weight appears in both numerator and denominator, so the value is
// (2 s2 + 5 s3 - s4) / 6 up to rounding; the weighted form is kept as is.
__device__ REAL OCFD_weno5_kernel_P_lift(REAL *stencil){
    const REAL ep = 1e-6;

    const REAL curv  = stencil[2] - 2.0*stencil[3] + stencil[4];
    const REAL slope = 3.0*stencil[2] - 4.0*stencil[3] + stencil[4];

    REAL S0 = 13*curv*curv;
    S0 += 3*slope*slope;

    const REAL a0  = 1.0/((12.0*ep + S0)*(12.0*ep + S0));
    const REAL q03 = (2.0*stencil[2] + 5.0*stencil[3] - stencil[4]);

    const REAL value = a0*q03/(6.0*a0);
    return value;
}
// Single-candidate WENO5-style reconstruction using stencil[0..2], mirrored
// form.
//
// S0 is the smoothness measure 13*(s2 - 2 s1 + s0)^2 + 3*(3 s2 - 4 s1 + s0)^2
// and a0 = 1 / (12*ep + S0)^2 the resulting weight. With only one candidate
// the weight appears in both numerator and denominator, so the value is
// (2 s2 + 5 s1 - s0) / 6 up to rounding; the weighted form is kept as is.
__device__ REAL OCFD_weno5_kernel_M_right(REAL *stencil){
    const REAL ep = 1e-6;

    const REAL curv  = stencil[2] - 2.0*stencil[1] + stencil[0];
    const REAL slope = 3.0*stencil[2] - 4.0*stencil[1] + stencil[0];

    REAL S0 = 13*curv*curv;
    S0 += 3*slope*slope;

    const REAL a0  = 1.0/((12.0*ep + S0)*(12.0*ep + S0));
    const REAL q03 = (2.0*stencil[2] + 5.0*stencil[1] - stencil[0]);

    const REAL value = a0*q03/(6.0*a0);
    return value;
}
// Single-candidate WENO5-style reconstruction using stencil[2..4], mirrored
// form.
//
// S2 is the smoothness measure 13*(s4 - 2 s3 + s2)^2 + 3*(s4 - 4 s3 + 3 s2)^2
// and a2 = 1 / (12*ep + S2)^2 the resulting weight. With only one candidate
// the weight appears in both numerator and denominator, so the value is
// (2 s4 - 7 s3 + 11 s2) / 6 up to rounding; the weighted form is kept as is.
__device__ REAL OCFD_weno5_kernel_M_lift(REAL *stencil){
    const REAL ep = 1e-6;

    const REAL curv  = stencil[4] - 2.0*stencil[3] + stencil[2];
    const REAL slope = stencil[4] - 4.0*stencil[3] + 3.0*stencil[2];

    REAL S2 = 13*curv*curv;
    S2 += 3*slope*slope;

    const REAL a2  = 1.0/((12.0*ep + S2)*(12.0*ep + S2));
    const REAL q23 = (2.0*stencil[4] - 7.0*stencil[3] + 11.0*stencil[2]);

    const REAL value = a2*q23/(6.0*a2);
    return value;
}
// Two-candidate WENO5-style reconstruction using stencil[0..3].
//
// Candidate 1 (stencil[1..3]): weight a1 = 6 / (12*ep + S1)^2,
//                              value  q13 = -s1 + 5 s2 + 2 s3.
// Candidate 2 (stencil[0..2]): weight a2 = 1 / (12*ep + S2)^2,
//                              value  q23 = 2 s0 - 7 s1 + 11 s2.
// Returns (a1*q13 + a2*q23) / (6 * (a1 + a2)).
__device__ REAL OCFD_weno5_kernel_P_right_plus(REAL *stencil){
    const REAL ep = 1e-6;

    // Candidate 1.
    const REAL curv1  = stencil[1] - 2.0*stencil[2] + stencil[3];
    const REAL slope1 = stencil[1] - stencil[3];
    REAL S1 = 13*curv1*curv1;
    S1 += 3*slope1*slope1;
    const REAL a1  = 6.0/((12.0*ep + S1)*(12.0*ep + S1));
    const REAL q13 = (-stencil[1] + 5.0*stencil[2] + 2.0*stencil[3]);

    // Candidate 2.
    const REAL curv2  = stencil[0] - 2.0*stencil[1] + stencil[2];
    const REAL slope2 = stencil[0] - 4.0*stencil[1] + 3.0*stencil[2];
    REAL S2 = 13*curv2*curv2;
    S2 += 3*slope2*slope2;
    const REAL a2  = 1.0/((12.0*ep + S2)*(12.0*ep + S2));
    const REAL q23 = (2.0*stencil[0] - 7.0*stencil[1] + 11.0*stencil[2]);

    const REAL value = (a1*q13 + a2*q23)/(6.0*(a1 + a2));
    return value;
}
// Two-candidate WENO5-style reconstruction using stencil[1..4].
//
// Candidate 0 (stencil[2..4]): weight a0 = 3 / (12*ep + S0)^2,
//                              value  q03 = 2 s2 + 5 s3 - s4.
// Candidate 1 (stencil[1..3]): weight a1 = 6 / (12*ep + S1)^2,
//                              value  q13 = -s1 + 5 s2 + 2 s3.
// Returns (a0*q03 + a1*q13) / (6 * (a0 + a1)).
__device__ REAL OCFD_weno5_kernel_P_lift_plus(REAL *stencil){
    const REAL ep = 1e-6;

    // Candidate 0.
    const REAL curv0  = stencil[2] - 2.0*stencil[3] + stencil[4];
    const REAL slope0 = 3.0*stencil[2] - 4.0*stencil[3] + stencil[4];
    REAL S0 = 13*curv0*curv0;
    S0 += 3*slope0*slope0;
    const REAL a0  = 3.0/((12.0*ep + S0)*(12.0*ep + S0));
    const REAL q03 = (2.0*stencil[2] + 5.0*stencil[3] - stencil[4]);

    // Candidate 1.
    const REAL curv1  = stencil[1] - 2.0*stencil[2] + stencil[3];
    const REAL slope1 = stencil[1] - stencil[3];
    REAL S1 = 13*curv1*curv1;
    S1 += 3*slope1*slope1;
    const REAL a1  = 6.0/((12.0*ep + S1)*(12.0*ep + S1));
    const REAL q13 = (-stencil[1] + 5.0*stencil[2] + 2.0*stencil[3]);

    const REAL value = (a0*q03 + a1*q13)/(6.0*(a0 + a1));
    return value;
}
// Two-candidate WENO5-style reconstruction using stencil[0..3], mirrored
// form.
//
// Candidate 0 (stencil[0..2]): weight a0 = 3 / (12*ep + S0)^2,
//                              value  q03 = 2 s2 + 5 s1 - s0.
// Candidate 1 (stencil[1..3]): weight a1 = 6 / (12*ep + S1)^2,
//                              value  q13 = -s3 + 5 s2 + 2 s1.
// Returns (a0*q03 + a1*q13) / (6 * (a0 + a1)).
__device__ REAL OCFD_weno5_kernel_M_right_plus(REAL *stencil){
    const REAL ep = 1e-6;

    // Candidate 0.
    const REAL curv0  = stencil[2] - 2.0*stencil[1] + stencil[0];
    const REAL slope0 = 3.0*stencil[2] - 4.0*stencil[1] + stencil[0];
    REAL S0 = 13*curv0*curv0;
    S0 += 3*slope0*slope0;
    const REAL a0  = 3.0/((12.0*ep + S0)*(12.0*ep + S0));
    const REAL q03 = (2.0*stencil[2] + 5.0*stencil[1] - stencil[0]);

    // Candidate 1.
    const REAL curv1  = stencil[3] - 2.0*stencil[2] + stencil[1];
    const REAL slope1 = stencil[3] - stencil[1];
    REAL S1 = 13*curv1*curv1;
    S1 += 3*slope1*slope1;
    const REAL a1  = 6.0/((12.0*ep + S1)*(12.0*ep + S1));
    const REAL q13 = (-stencil[3] + 5.0*stencil[2] + 2.0*stencil[1]);

    const REAL value = (a0*q03 + a1*q13)/(6.0*(a0 + a1));
    return value;
}
// Two-candidate WENO5-style reconstruction using stencil[1..4], mirrored
// form.
//
// Candidate 1 (stencil[1..3]): weight a1 = 6 / (12*ep + S1)^2,
//                              value  q13 = -s3 + 5 s2 + 2 s1.
// Candidate 2 (stencil[2..4]): weight a2 = 1 / (12*ep + S2)^2,
//                              value  q23 = 2 s4 - 7 s3 + 11 s2.
// Returns (a1*q13 + a2*q23) / (6 * (a1 + a2)).
__device__ REAL OCFD_weno5_kernel_M_lift_plus(REAL *stencil){
    const REAL ep = 1e-6;

    // Candidate 1.
    const REAL curv1  = stencil[3] - 2.0*stencil[2] + stencil[1];
    const REAL slope1 = stencil[3] - stencil[1];
    REAL S1 = 13*curv1*curv1;
    S1 += 3*slope1*slope1;
    const REAL a1  = 6.0/((12.0*ep + S1)*(12.0*ep + S1));
    const REAL q13 = (-stencil[3] + 5.0*stencil[2] + 2.0*stencil[1]);

    // Candidate 2.
    const REAL curv2  = stencil[4] - 2.0*stencil[3] + stencil[2];
    const REAL slope2 = stencil[4] - 4.0*stencil[3] + 3.0*stencil[2];
    REAL S2 = 13*curv2*curv2;
    S2 += 3*slope2*slope2;
    const REAL a2  = 1.0/((12.0*ep + S2)*(12.0*ep + S2));
    const REAL q23 = (2.0*stencil[4] - 7.0*stencil[3] + 11.0*stencil[2]);

    const REAL value = (a1*q13 + a2*q23)/(6.0*(a1 + a2));
    return value;
}
//tmp = (2.0*stencil[-ka1+1] + 5.0*stencil[-ka1+2] + stencil[-ka1+3])/6.0;
//tmp = (11.0*stencil[-ka1] - 7.0*stencil[-ka1-1] + 2.0*stencil[-ka1-2])/6.0;
/*
 * Boundary treatment for the "p" reconstruction near a block face.
 *
 * flagxyzb.y selects the face: 1/2/3 are the low-index side along x/y/z,
 * 4/5/6 the high-index side along x/y/z.  Only the coordinate belonging to
 * that axis is inspected.
 *
 * tmp      output, receives the boundary value when one of the listed
 *          positions matches
 * coords   local point index; compared against the job extent
 *          (job.end - job.start) on the high-index side
 * stencil  sample window; the point itself is addressed as stencil[-ka1]
 * ka1,kb1  stencil reach used to decide how many points count as "near"
 *
 * Returns 0 when the point lies in the boundary band of the selected face
 * (the caller should then use *tmp), 1 otherwise.
 *
 * NOTE(review): on faces 1..3 only positions 0..3 write *tmp; if -ka1 were
 * larger than 3 a point could return 0 with *tmp untouched - confirm the
 * admissible range of ka1 with the callers.
 */
__device__ int OCFD_bound_scheme_kernel_p(REAL* tmp, dim3 flagxyzb, dim3 coords, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
    const unsigned int face = flagxyzb.y;

    /* ---- low-index faces -------------------------------------------- */
    if(face >= 1 && face <= 3){
        unsigned int pos;
        if(face == 1){
            pos = coords.x;
        }else if(face == 2){
            pos = coords.y;
        }else{
            pos = coords.z;
        }

        // Outside the boundary band: nothing to do here.
        if(!(pos <= -ka1)) return 1;

        // The tests are kept as independent ifs, exactly one can match.
        if(pos == 0){
            *tmp = stencil[-ka1+1];
        }
        if(pos == 1){
            *tmp = OCFD_weno5_kernel_P_lift(&stencil[-ka1-2]);
        }
        if(pos == 2){
            *tmp = OCFD_weno5_kernel_P_lift_plus(&stencil[-ka1-2]);
        }
        if(pos == 3){
            *tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
        }
        return 0;
    }

    /* ---- high-index faces ------------------------------------------- */
    if(face >= 4 && face <= 6){
        unsigned int pos;
        unsigned int extent;    // number of points of the job along the axis
        if(face == 4){
            pos = coords.x;
            extent = job.end.x - job.start.x;
        }else if(face == 5){
            pos = coords.y;
            extent = job.end.y - job.start.y;
        }else{
            pos = coords.z;
            extent = job.end.z - job.start.z;
        }

        // Outside the boundary band: nothing to do here.
        if(!(pos > extent - kb1)) return 1;

        // Independent ifs on purpose: the arithmetic is unsigned, so the
        // last test may also hold for a point matched earlier and then wins.
        if(pos == extent - 1){
            // Last point: one-sided limited extrapolation.
            *tmp = stencil[-ka1] + 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
        }
        if(pos == extent - 2){
            *tmp = OCFD_weno5_kernel_P_right_plus(&stencil[-ka1-2]);
        }
        if(pos <= extent - 3){
            *tmp = OCFD_weno5_kernel_P(&stencil[-ka1-2]);
        }
        return 0;
    }

    // Any other face flag: not a boundary handled by this routine.
    return 1;
}
/*
 * Boundary treatment for the "m" reconstruction near a block face.
 *
 * flagxyzb.y selects the face: 1/2/3 are the low-index side along x/y/z,
 * 4/5/6 the high-index side along x/y/z.  Only the coordinate belonging to
 * that axis is inspected.
 *
 * tmp      output, receives the boundary value when one of the listed
 *          positions matches
 * coords   local point index; compared against the job extent
 *          (job.end - job.start) on the high-index side
 * stencil  sample window; the point itself is addressed as stencil[-ka1]
 * ka1,kb1  stencil reach used to decide how many points count as "near"
 *
 * Returns 0 when the point lies in the boundary band of the selected face
 * (the caller should then use *tmp), 1 otherwise.
 *
 * NOTE(review): on faces 4..6 only the last four positions write *tmp; if
 * kb1 were larger than 3 a point could return 0 with *tmp untouched -
 * confirm the admissible range of kb1 with the callers.
 */
__device__ int OCFD_bound_scheme_kernel_m(REAL* tmp, dim3 flagxyzb, dim3 coords, REAL *stencil, int ka1, int kb1, cudaJobPackage job){
    const unsigned int face = flagxyzb.y;

    /* ---- low-index faces -------------------------------------------- */
    if(face >= 1 && face <= 3){
        unsigned int pos;
        if(face == 1){
            pos = coords.x;
        }else if(face == 2){
            pos = coords.y;
        }else{
            pos = coords.z;
        }

        // Outside the boundary band: nothing to do here.
        if(!(pos < -ka1)) return 1;

        if(pos == 0){
            *tmp = OCFD_weno5_kernel_M_lift(&stencil[-ka1-2]);
        }
        if(pos == 1){
            *tmp = OCFD_weno5_kernel_M_lift_plus(&stencil[-ka1-2]);
        }
        if(pos >= 2){
            *tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
        }
        return 0;
    }

    /* ---- high-index faces ------------------------------------------- */
    if(face >= 4 && face <= 6){
        unsigned int pos;
        unsigned int extent;    // number of points of the job along the axis
        if(face == 4){
            pos = coords.x;
            extent = job.end.x - job.start.x;
        }else if(face == 5){
            pos = coords.y;
            extent = job.end.y - job.start.y;
        }else{
            pos = coords.z;
            extent = job.end.z - job.start.z;
        }

        // Outside the boundary band: nothing to do here.
        if(!(pos >= extent - kb1 - 1)) return 1;

        if(pos == extent - 1){
            // Last point: plain copy of the neighbouring sample.
            *tmp = stencil[-ka1-1];
        }
        if(pos == extent - 2){
            // One-sided limited extrapolation.
            *tmp = stencil[-ka1] - 0.5*minmod2(stencil[-ka1] - stencil[-ka1-1], stencil[-ka1] - stencil[-ka1-1]);
        }
        if(pos == extent - 3){
            *tmp = OCFD_weno5_kernel_M_right_plus(&stencil[-ka1-2]);
        }
        if(pos == extent - 4){
            *tmp = OCFD_weno5_kernel_M(&stencil[-ka1-2]);
        }
        return 0;
    }

    // Any other face flag: not a boundary handled by this routine.
    return 1;
}
#ifdef __cplusplus
}
#endif
//-------Boundary condition --------------------------------------------------------
#include "stdlib.h"
#include "stdio.h"
#include "parameters.h"
#include "utility.h"
#include "OCFD_boundary_Liftbody3D.h"
#include "OCFD_boundary_compression_conner.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
#include "commen_kernel.h"
#include "parameters_d.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
 * Apply the user boundary condition selected by IBC_USER, then run
 * pri_to_cons_kernel on every outer face owned by this rank.
 *
 * IBC_USER == 124 : lifting-body case (simplified variant while
 *                   Init_stat == 0, full variant afterwards)
 * IBC_USER == 108 : compression-corner case
 * anything else   : no user boundary routine is called
 *
 * A face is processed only when the rank sits on the corresponding edge of
 * the process grid (npx/npy/npz equal to 0 or NP?0-1).  Each launch covers a
 * one-cell-thick slab described by the job package.
 *
 * The launches go through CUDA_LAUNCH like the other kernel launches in the
 * boundary code, so launch failures are reported through the same path.
 * NOTE(review): pri_to_cons_kernel presumably rebuilds the conservative
 * variables from d/u/v/w/T - confirm against its definition.
 */
void OCFD_bc()
{
    //---------------------------------------------------
    switch(IBC_USER){
        case 124:
        if(Init_stat == 0){
            bc_user_Liftbody3d_simple();
        }else{
            bc_user_Liftbody3d();
        }
        break;
        case 108:
        bc_user_Compression_conner();
        break;
        default:
        break;
    }
    //--------------------------------------------
    // x faces: slab of thickness 1 in x
    if (npx == 0)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(0 , 0 , 0) , dim3(1 , ny , nz) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , 1 , ny , nz);
        CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , job) ));
    }
    if (npx == NPX0 - 1)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(nx-1 , 0 , 0) , dim3(nx , ny , nz) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , 1 , ny , nz);
        CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , job) ));
    }
    //------------------------------
    // y faces: slab of thickness 1 in y
    if (npy == 0)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(0 , 0 , 0) , dim3(nx , 1 , nz) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , 1 , nz);
        CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , job) ));
    }
    if (npy == NPY0 - 1)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(0 , ny-1 , 0) , dim3(nx , ny , nz) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , 1 , nz);
        CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , job) ));
    }
    //--------------------------
    // z faces: slab of thickness 1 in z
    if (npz == 0)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(0 , 0 , 0) , dim3(nx , ny , 1) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , 1);
        CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , job) ));
    }
    if (npz == NPZ0 - 1)
    {
        dim3 griddim , blockdim;
        cudaJobPackage job( dim3(0 , 0 , nz-1) , dim3(nx , ny , nz) );
        cal_grid_block_dim(&griddim , &blockdim , BlockDimX , BlockDimY , BlockDimZ , nx , ny , 1);
        CUDA_LAUNCH(( pri_to_cons_kernel<<<griddim , blockdim>>>(*pf_d , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , job) ));
    }
}
#ifdef __cplusplus
}
#endif
// Boundary condition for flow over a 3D Liftbody -------------------------
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "parameters.h"
#include "parameters_d.h"
#include "utility.h"
#include "io_warp.h"
#include "cuda_commen.h"
#include "cuda_utility.h"
//#include "OCFD_boundary_init.h"
#include "OCFD_init.h"
#ifdef __cplusplus
extern "C"{
#endif
extern cudaField *pu2d_inlet_d; //[5][nz][ny]
extern cudaField *pu2d_upper_d; //[5][ny][nx]
//extern cudaField *pv_dist_wall_d; // [ny][nx]
extern cudaField *pv_dist_coeff_d; // [3][ny][nx]
extern cudaField *pu_dist_upper_d; // [ny][nx]
extern const char v_dist_need;
extern const char TW_postive;
extern REAL *fait;
extern REAL *TM;
/*
 * Copy the stored 2-D inlet profile into the x planes 0..LAP of the
 * primitive fields.
 *
 * Launch layout: one thread per (y,z) pair; thread indices are offset by
 * job.start and bounded by job.end, the x component of the job is not used.
 * y and z include the LAP halo offset, so the profile is addressed with
 * (y-LAP, z-LAP).
 *
 * inlet holds five consecutive planes (d, u, v, w, T), each nz_d rows apart,
 * with inlet.pitch elements per row.
 *
 * The five profile values do not depend on the x index, so they are loaded
 * once per thread instead of once per written plane.
 */
__global__ void do_u2d_inlet_kernel(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField inlet , cudaJobPackage job){
    // with LAPs
    unsigned int y = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    unsigned int z = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;
    if(y < job.end.y && z < job.end.z){
        unsigned int ylap = y-LAP;
        unsigned int zlap = z-LAP;

        // Loop-invariant loads hoisted out of the x loop.
        const REAL d_in = *(inlet.ptr + ylap + inlet.pitch * zlap);
        const REAL u_in = *(inlet.ptr + ylap + inlet.pitch *( zlap + 1*nz_d) );
        const REAL v_in = *(inlet.ptr + ylap + inlet.pitch *( zlap + 2*nz_d) );
        const REAL w_in = *(inlet.ptr + ylap + inlet.pitch *( zlap + 3*nz_d) );
        const REAL T_in = *(inlet.ptr + ylap + inlet.pitch *( zlap + 4*nz_d) );

        // Fill the halo planes and the first interior plane (x = 0..LAP).
        for(int i = 0; i <= LAP; i++){
            get_Field_LAP(d, i, y, z) = d_in;
            get_Field_LAP(u, i, y, z) = u_in;
            get_Field_LAP(v, i, y, z) = v_in;
            get_Field_LAP(w, i, y, z) = w_in;
            get_Field_LAP(T, i, y, z) = T_in;
        }
    }
}
/* ================================= */
/*
 * Impose the stored 2-D upper-boundary profile on the plane
 * z = nz_lap_d - 1 of the primitive fields; the u component additionally
 * receives the value read from dist.
 *
 * Launch layout: one thread per (x,y) pair; indices are offset by job.start
 * and bounded by job.end, the z component of the job is not used.
 * x and y include the LAP halo offset; upper and dist are addressed with
 * (x-LAP, y-LAP).  upper holds five planes (d, u, v, w, T), ny_d rows apart.
 */
__global__ void do_u2d_upper_kernel(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField upper , cudaField dist , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y) return;

    const unsigned int ix_in = ix-LAP;          // index without halo
    const unsigned int iy_in = iy-LAP;
    const unsigned int k_top = nz_lap_d - 1;    // target plane

    get_Field_LAP(d , ix , iy , k_top) = *(upper.ptr + ix_in + upper.pitch * iy_in );
    get_Field_LAP(u , ix , iy , k_top) = *(upper.ptr + ix_in + upper.pitch *( iy_in + 1*ny_d) ) + *(dist.ptr + ix_in + dist.pitch * iy_in );
    get_Field_LAP(v , ix , iy , k_top) = *(upper.ptr + ix_in + upper.pitch *( iy_in + 2*ny_d) );
    get_Field_LAP(w , ix , iy , k_top) = *(upper.ptr + ix_in + upper.pitch *( iy_in + 3*ny_d) );
    get_Field_LAP(T , ix , iy , k_top) = *(upper.ptr + ix_in + upper.pitch *( iy_in + 4*ny_d) );
}
/*
 * Impose a uniform state on the plane z = nz_lap_d - 1:
 * d = 1, u = cos_aoa + dist(x,y), v = 0, w = sin_aoa, T = 1.
 *
 * Launch layout: one thread per (x,y) pair; indices are offset by job.start
 * and bounded by job.end, the z component of the job is not used.
 * x and y include the LAP halo offset; dist is addressed with (x-LAP, y-LAP).
 */
__global__ void do_u_dist_upper_kernel(REAL sin_aoa , REAL cos_aoa , cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField dist , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y) return;

    const unsigned int ix_in = ix-LAP;          // index without halo
    const unsigned int iy_in = iy-LAP;
    const unsigned int k_top = nz_lap_d - 1;    // target plane

    get_Field_LAP(d , ix , iy , k_top) = 1.0;
    get_Field_LAP(u , ix , iy , k_top) = cos_aoa + *(dist.ptr + ix_in + dist.pitch * iy_in );
    get_Field_LAP(v , ix , iy , k_top) = 0.0;
    get_Field_LAP(w , ix , iy , k_top) = sin_aoa;
    get_Field_LAP(T , ix , iy , k_top) = 1.0;
}
/* ============================================= */
/*
 * Mirror condition about the plane y = LAP: every point of the job is
 * filled from its mirror image at y' = 2*LAP - y.  d, u, w and T are
 * copied, v changes sign.
 *
 * Launch layout: one thread per (x,y,z) point; indices are offset by
 * job.start and bounded by job.end, and include the LAP halo offset.
 */
__global__ void do_symmetry_kernel_m(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;

    const unsigned int iy_src = 2*LAP - iy;     // mirror image of iy

    get_Field_LAP(d, ix, iy, iz) = get_Field_LAP(d, ix, iy_src, iz);
    get_Field_LAP(u, ix, iy, iz) = get_Field_LAP(u, ix, iy_src, iz);
    get_Field_LAP(v, ix, iy, iz) = -1.0*get_Field_LAP(v, ix, iy_src, iz);   // normal velocity flips
    get_Field_LAP(w, ix, iy, iz) = get_Field_LAP(w, ix, iy_src, iz);
    get_Field_LAP(T, ix, iy, iz) = get_Field_LAP(T, ix, iy_src, iz);
}
/*
 * Mirror condition about the plane y = ny_lap_d - 1: every point of the job
 * is filled from its mirror image at y' = 2*(ny_lap_d - 1) - y.  d, u, w
 * and T are copied, v changes sign.
 *
 * Launch layout: one thread per (x,y,z) point; indices are offset by
 * job.start and bounded by job.end, and include the LAP halo offset.
 */
__global__ void do_symmetry_kernel_p(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;

    const unsigned int iy_src = 2*(ny_lap_d - 1) - iy;  // mirror image of iy

    get_Field_LAP(d, ix, iy, iz) = get_Field_LAP(d, ix, iy_src, iz);
    get_Field_LAP(u, ix, iy, iz) = get_Field_LAP(u, ix, iy_src, iz);
    get_Field_LAP(v, ix, iy, iz) = -1.0*get_Field_LAP(v, ix, iy_src, iz);   // normal velocity flips
    get_Field_LAP(w, ix, iy, iz) = get_Field_LAP(w, ix, iy_src, iz);
    get_Field_LAP(T, ix, iy, iz) = get_Field_LAP(T, ix, iy_src, iz);
}
/* =============================================== */
/*
 * Wall plane z = LAP with prescribed temperature and prescribed velocity.
 *
 * u, v, w are set to the three planes of coeff (ny_d rows apart) scaled by
 * HT; T is set to tw; d follows from
 *   d*tw = (4*d1*T1 - d2*T2)/3
 * using the two planes above the wall (z = LAP+1, LAP+2).
 *
 * Launch layout: one thread per (x,y) pair; indices are offset by job.start
 * and bounded by job.end, the z component of the job is not used.
 * x and y include the LAP halo offset; coeff is addressed with (x-LAP, y-LAP).
 */
__global__ void do_wall_kernel_T_V(REAL tw , cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , REAL HT , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y) return;

    const unsigned int ix_in = ix-LAP;          // index without halo
    const unsigned int iy_in = iy-LAP;

    // Wall velocity from the disturbance coefficients.
    get_Field_LAP(u , ix , iy , LAP) = *(coeff.ptr + ix_in + coeff.pitch * iy_in ) * HT;
    get_Field_LAP(v , ix , iy , LAP) = *(coeff.ptr + ix_in + coeff.pitch *( iy_in + 1*ny_d) ) * HT;
    get_Field_LAP(w , ix , iy , LAP) = *(coeff.ptr + ix_in + coeff.pitch *( iy_in + 2*ny_d) ) * HT;

    // Fixed wall temperature, density from the extrapolated d*T product.
    get_Field_LAP(T , ix , iy , LAP) = tw;
    get_Field_LAP(d , ix , iy , LAP) = (4.0 * get_Field_LAP(d , ix,iy,LAP+1) * get_Field_LAP(T , ix , iy , LAP+1) - get_Field_LAP(d , ix,iy,LAP+2) * get_Field_LAP(T , ix , iy , LAP+2))/(3.0*tw);
}
/*
 * Wall plane z = LAP with extrapolated temperature and prescribed velocity.
 *
 * u, v, w are set to the three planes of coeff (ny_d rows apart) scaled by
 * HT; T and d are each set to (4*f1 - f2)/3 from the two planes above the
 * wall (z = LAP+1, LAP+2).
 *
 * Launch layout: one thread per (x,y) pair; indices are offset by job.start
 * and bounded by job.end, the z component of the job is not used.
 * x and y include the LAP halo offset; coeff is addressed with (x-LAP, y-LAP).
 */
__global__ void do_wall_kernel_NT_V(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , REAL HT , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y) return;

    const unsigned int ix_in = ix-LAP;          // index without halo
    const unsigned int iy_in = iy-LAP;

    // Wall velocity from the disturbance coefficients.
    get_Field_LAP(u , ix , iy , LAP) = *(coeff.ptr + ix_in + coeff.pitch * iy_in ) * HT;
    get_Field_LAP(v , ix , iy , LAP) = *(coeff.ptr + ix_in + coeff.pitch *( iy_in + 1*ny_d) ) * HT;
    get_Field_LAP(w , ix , iy , LAP) = *(coeff.ptr + ix_in + coeff.pitch *( iy_in + 2*ny_d) ) * HT;

    // One-sided extrapolation of temperature and density onto the wall.
    get_Field_LAP(T , ix , iy , LAP) = (4.0 * get_Field_LAP(T , ix , iy , LAP+1) - get_Field_LAP(T , ix , iy , LAP+2))/3.0;
    get_Field_LAP(d , ix , iy , LAP) = (4.0 * get_Field_LAP(d , ix , iy , LAP+1) - get_Field_LAP(d , ix , iy , LAP+2))/3.0;
}
/*
 * Wall plane z = LAP with prescribed temperature and zero velocity.
 *
 * u = v = w = 0, T = tw, and d follows from
 *   d*tw = (4*d1*T1 - d2*T2)/3
 * using the two planes above the wall (z = LAP+1, LAP+2).
 * The coeff argument is accepted but not read.
 *
 * Launch layout: one thread per (x,y) pair; indices are offset by job.start
 * and bounded by job.end, the z component of the job is not used.
 */
__global__ void do_wall_kernel_T_NV(REAL tw , cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y) return;

    // No-slip wall.
    get_Field_LAP(u , ix , iy , LAP) = 0.0;
    get_Field_LAP(v , ix , iy , LAP) = 0.0;
    get_Field_LAP(w , ix , iy , LAP) = 0.0;

    // Fixed wall temperature, density from the extrapolated d*T product.
    get_Field_LAP(T , ix , iy , LAP) = tw;
    get_Field_LAP(d , ix , iy , LAP) = (4.0 * get_Field_LAP(d , ix,iy,LAP+1) * get_Field_LAP(T , ix , iy , LAP+1) - get_Field_LAP(d , ix,iy,LAP+2) * get_Field_LAP(T , ix , iy , LAP+2))/(3.0*tw);
}
/*
 * Wall plane z = LAP with extrapolated temperature and zero velocity.
 *
 * u = v = w = 0; T and d are each set to (4*f1 - f2)/3 from the two planes
 * above the wall (z = LAP+1, LAP+2).
 * The coeff argument is accepted but not read.
 *
 * Launch layout: one thread per (x,y) pair; indices are offset by job.start
 * and bounded by job.end, the z component of the job is not used.
 */
__global__ void do_wall_kernel_NT_NV(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaField coeff , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y) return;

    // No-slip wall.
    get_Field_LAP(u , ix , iy , LAP) = 0.0;
    get_Field_LAP(v , ix , iy , LAP) = 0.0;
    get_Field_LAP(w , ix , iy , LAP) = 0.0;

    // One-sided extrapolation of temperature and density onto the wall.
    get_Field_LAP(T , ix , iy , LAP) = (4.0 * get_Field_LAP(T , ix , iy , LAP+1) - get_Field_LAP(T , ix , iy , LAP+2))/3.0;
    get_Field_LAP(d , ix , iy , LAP) = (4.0 * get_Field_LAP(d , ix , iy , LAP+1) - get_Field_LAP(d , ix , iy , LAP+2))/3.0;
}
/* ---------------------------------------------- */
/* ======================================== */
/*
 * Boundary conditions for the 3-D lifting-body case.
 *
 * Depending on where this rank sits in the process grid it launches:
 *   npx == 0        inlet profile (aborts if IF_WITHLEADING == 1)
 *   npz == NPZ0-1   upper boundary, variant chosen by IFLAG_UPPERBOUNDARY
 *   npz == 0        wall, variant chosen by v_dist_need / TW_postive
 *   npy == 0 / NPY0-1 with IF_SYMMETRY == 1   mirror planes in y
 *
 * The wall disturbance amplitude ht is 1 when BETA <= 0, otherwise the sum
 * over m of TM[m]*sin((m+1)*BETA*tt).
 */
void bc_user_Liftbody3d(){
    /* ---------------- inlet (first x plane) ---------------- */
    if (npx == 0)
    {
        if (IF_WITHLEADING == 1)
        {
            printf(" Lift body with leading is not support yet \n");
            exit(EXIT_FAILURE);
        }

        // Reached only for the configuration without leading edge.
        dim3 inlet_block , inlet_grid;
        cal_grid_block_dim(&inlet_grid , &inlet_block , 1 , BlockDimY , BlockDimZ , 1 , ny , nz);
        cudaJobPackage inlet_job( dim3(LAP,LAP,LAP) , dim3(LAP+1, ny_lap ,nz_lap) );
        CUDA_LAUNCH(( do_u2d_inlet_kernel<<<inlet_grid , inlet_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pu2d_inlet_d , inlet_job) ));
    }

    /* ---------------- upper boundary (last z plane) ---------------- */
    if (npz == NPZ0 - 1 && (IFLAG_UPPERBOUNDARY == 0 || IFLAG_UPPERBOUNDARY == 1))
    {
        // Both variants share the same launch geometry.
        dim3 upper_block , upper_grid;
        cal_grid_block_dim(&upper_grid , &upper_block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
        cudaJobPackage upper_job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap ,LAP+1) );

        if (IFLAG_UPPERBOUNDARY == 0)
        {   // boundary outside the bow shock
            CUDA_LAUNCH(( do_u_dist_upper_kernel<<<upper_grid , upper_block>>>( Sin_AOA , Cos_AOA ,*pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pu_dist_upper_d, upper_job) ));
        }
        else
        {   // boundary inside the bow shock
            CUDA_LAUNCH(( do_u2d_upper_kernel<<<upper_grid , upper_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pu2d_upper_d , *pu_dist_upper_d, upper_job) ));
        }
    }

    /* ---------------- wall (first z plane) ---------------- */
    // Time modulation of the wall disturbance; the phase term
    // 2.*PI*fait[m] is currently not applied.
    REAL ht = 1.;
    if(BETA > 0.){
        ht = 0.;
        for(int m = 0; m < MTMAX; m++){
            ht += TM[m] * sin((m + 1)*BETA*tt);
        }
    }

    if (npz == 0)
    {
        // All four wall variants share the same launch geometry.
        dim3 wall_block , wall_grid;
        cal_grid_block_dim(&wall_grid , &wall_block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
        cudaJobPackage wall_job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap ,LAP+1) );

        if(v_dist_need && TW_postive){
            CUDA_LAUNCH(( do_wall_kernel_T_V<<<wall_grid , wall_block>>>( TW , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d , ht , wall_job) ));
        }else if(v_dist_need){
            CUDA_LAUNCH(( do_wall_kernel_NT_V<<<wall_grid , wall_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d, ht , wall_job) ));
        }else if(TW_postive){
            CUDA_LAUNCH(( do_wall_kernel_T_NV<<<wall_grid , wall_block>>>(TW , *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d , wall_job) ));
        }else{
            CUDA_LAUNCH(( do_wall_kernel_NT_NV<<<wall_grid , wall_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , *pv_dist_coeff_d , wall_job) ));
        }
    }

    /* ---------------- symmetry planes in y ---------------- */
    if (IF_SYMMETRY == 1)
    {
        if (npy == 0)
        {   // halo rows below the first interior row
            dim3 sym_block , sym_grid;
            cal_grid_block_dim(&sym_grid , &sym_block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
            cudaJobPackage sym_job( dim3(LAP,0,LAP) , dim3(nx_lap, LAP ,nz_lap) );
            CUDA_LAUNCH(( do_symmetry_kernel_m<<<sym_grid , sym_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_job) ));
        }
        if (npy == NPY0 - 1)
        {   // halo rows above the last interior row
            dim3 sym_block , sym_grid;
            cal_grid_block_dim(&sym_grid , &sym_block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
            cudaJobPackage sym_job( dim3(LAP,ny_lap,LAP) , dim3(nx_lap, ny_2lap ,nz_lap) );
            CUDA_LAUNCH(( do_symmetry_kernel_p<<<sym_grid , sym_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_job) ));
        }
    }
}
/* =============================================================================== */
/*
 * Fill every point of the job with the constant state
 * d = 1, u = 1, v = 0, w = 0, T = 1.
 *
 * Launch layout: one thread per (x,y,z) point; indices are offset by
 * job.start and bounded by job.end, and include the LAP halo offset.
 */
__global__ void simple_boundary_condition(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;

    get_Field_LAP(d , ix , iy , iz) = 1.0;
    get_Field_LAP(u , ix , iy , iz) = 1.0;
    get_Field_LAP(v , ix , iy , iz) = 0.0;
    get_Field_LAP(w , ix , iy , iz) = 0.0;
    get_Field_LAP(T , ix , iy , iz) = 1.0;
}
/*
 * Zero-gradient copy in x: every point of the job takes the values of its
 * neighbour at x-1 for d, u, v, w and T.
 *
 * Launch layout: one thread per (x,y,z) point; indices are offset by
 * job.start and bounded by job.end, and include the LAP halo offset.
 * The job must not start at x = 0, since x-1 is read.
 */
__global__ void out_boundary_condition(cudaField d , cudaField u , cudaField v , cudaField w , cudaField T , cudaJobPackage job){
    const unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x + job.start.x;
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y + job.start.y;
    const unsigned int iz = blockDim.z * blockIdx.z + threadIdx.z + job.start.z;

    // Threads beyond the job range have nothing to do.
    if(ix >= job.end.x || iy >= job.end.y || iz >= job.end.z) return;

    get_Field_LAP(d , ix , iy , iz) = get_Field_LAP(d , ix-1 , iy , iz);
    get_Field_LAP(u , ix , iy , iz) = get_Field_LAP(u , ix-1 , iy , iz);
    get_Field_LAP(v , ix , iy , iz) = get_Field_LAP(v , ix-1 , iy , iz);
    get_Field_LAP(w , ix , iy , iz) = get_Field_LAP(w , ix-1 , iy , iz);
    get_Field_LAP(T , ix , iy , iz) = get_Field_LAP(T , ix-1 , iy , iz);
}
/*
 * Simplified boundary conditions for the 3-D lifting-body case
 * (selected by OCFD_bc while Init_stat == 0).
 *
 * Depending on where this rank sits in the process grid it launches:
 *   npx == 0        constant state on the plane x = LAP
 *   npx == NPX0-1   copy from x-1 onto the plane x = nx_lap
 *   npz == NPZ0-1   constant state on the plane z = nz_lap-1
 *   npz == 0        constant state on the plane z = LAP
 *   npy == 0 / NPY0-1 with IF_SYMMETRY == 1   mirror planes in y
 */
void bc_user_Liftbody3d_simple(){
    /* ---------------- x boundaries ---------------- */
    if (npx == 0)
    {   // inlet
        dim3 face_block , face_grid;
        cal_grid_block_dim(&face_grid , &face_block , 1 , BlockDimY , BlockDimZ , 1 , ny , nz);
        cudaJobPackage face_job( dim3(LAP,LAP,LAP) , dim3(LAP+1, ny_lap ,nz_lap) );
        CUDA_LAUNCH(( simple_boundary_condition<<<face_grid , face_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face_job) ));
    }
    if (npx == NPX0 - 1)
    {   // outlet
        dim3 face_block , face_grid;
        cal_grid_block_dim(&face_grid , &face_block , 1 , BlockDimY , BlockDimZ , 1 , ny , nz);
        cudaJobPackage face_job( dim3(nx_lap,LAP,LAP) , dim3(nx_lap+1, ny_lap ,nz_lap) );
        CUDA_LAUNCH(( out_boundary_condition<<<face_grid , face_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face_job) ));
    }

    /* ---------------- z boundaries ---------------- */
    if (npz == NPZ0 - 1)
    {   // upper boundary
        dim3 face_block , face_grid;
        cal_grid_block_dim(&face_grid , &face_block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
        cudaJobPackage face_job( dim3(LAP,LAP,nz_lap - 1) , dim3(nx_lap, ny_lap ,nz_lap) );
        CUDA_LAUNCH(( simple_boundary_condition<<<face_grid , face_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face_job) ));
    }
    if (npz == 0)
    {   // wall
        dim3 face_block , face_grid;
        cal_grid_block_dim(&face_grid , &face_block , BlockDimX , BlockDimY , 1 , nx , ny , 1);
        cudaJobPackage face_job( dim3(LAP,LAP,LAP) , dim3(nx_lap, ny_lap ,LAP+1) );
        CUDA_LAUNCH(( simple_boundary_condition<<<face_grid , face_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , face_job) ));
    }

    /* ---------------- symmetry planes in y ---------------- */
    if (IF_SYMMETRY == 1)
    {
        if (npy == 0)
        {   // halo rows below the first interior row
            dim3 sym_block , sym_grid;
            cal_grid_block_dim(&sym_grid , &sym_block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
            cudaJobPackage sym_job( dim3(LAP,0,LAP) , dim3(nx_lap, LAP ,nz_lap) );
            CUDA_LAUNCH(( do_symmetry_kernel_m<<<sym_grid , sym_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_job) ));
        }
        if (npy == NPY0 - 1)
        {   // halo rows above the last interior row
            dim3 sym_block , sym_grid;
            cal_grid_block_dim(&sym_grid , &sym_block , BlockDimX , BlockDimY , BlockDimZ , nx , LAP , nz);
            cudaJobPackage sym_job( dim3(LAP,ny_lap,LAP) , dim3(nx_lap, ny_2lap ,nz_lap) );
            CUDA_LAUNCH(( do_symmetry_kernel_p<<<sym_grid , sym_block>>>( *pd_d , *pu_d , *pv_d , *pw_d , *pT_d , sym_job) ));
        }
    }
}
#ifdef __cplusplus
}
#endif
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment