#include "hip/hip_runtime.h"
/*******************************************************************************************
 * This file contains the implementation of GPU host functions related to seulex compute.
 ******************************************************************************************/

#include "thermoFluid.h"
#include "species.h"
#include "seulex_kernel.h"

/**
 * Kernel: writes the identity permutation into the P storage.
 *
 * Thread layout: x indexes `s` (bounded by size_d), y indexes the variable
 * `w` (bounded by sp_num_d + 2). For every in-range (w, s) whose dt_sum_d
 * entry is <= t_end_h, P(w, s) is set to w.
 *
 * @param ptr       buffer addressed through access_P
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void initP_g(REAL *ptr, REAL *dt_sum_d, REAL t_end_h)
{
    const unsigned int cell = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int var  = blockIdx.y * blockDim.y + threadIdx.y;

    size_t n_vars = sp_num_d + 2;

    // Threads outside the (variable, cell) range have nothing to do.
    if (var >= n_vars || cell >= size_d) return;

    // Skip cells that are already past the threshold.
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    access_P(ptr, var, cell) = var;
}

/**
 * Host launcher for initP_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param ptr  buffer forwarded to initP_g
 */
__host__ void initP_h(REAL *ptr) {

    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    initP_g<<<grid, block, 0, Stream_opencc[0]>>>(ptr, dt_sum_d, t_end_h);
}

/**
 * Kernel: element-wise copy from the J-layout buffer into the U-layout buffer.
 *
 * Thread layout: x indexes `s`, y and z index the two matrix indices, both
 * bounded by sp_num_d + 2. For every in-range element whose dt_sum_d entry is
 * <= t_end_h, U(i, j, s) = J(i, j, s).
 *
 * @param src       source buffer, read through access_J
 * @param ptr       destination buffer, written through access_U
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void copy_AtoU_g(REAL *src, REAL *ptr, REAL *dt_sum_d, REAL t_end_h)
{
    const unsigned int cell = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int a    = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int b    = blockIdx.z * blockDim.z + threadIdx.z;

    size_t n_vars = sp_num_d + 2;

    const bool in_range = (a < n_vars) && (b < n_vars) && (cell < size_d);
    if (!in_range) return;

    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    access_U(ptr, a, b, cell) = access_J(src, a, b, cell);
}

/**
 * Host launcher for copy_AtoU_g.
 *
 * Builds a 3D launch configuration with set_block_grid3d over
 * (size, sp_num + 2, sp_num + 2) using block_set_copy and enqueues the kernel
 * on Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param src  source buffer forwarded to the kernel
 * @param ptr  destination buffer forwarded to the kernel
 */
__host__ void copy_AtoU_h(REAL *src, REAL *ptr) {

    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid3d(n_cells, n_species + 2, n_species + 2, block_set_copy, grid, block);

    copy_AtoU_g<<<grid, block, 0, Stream_opencc[0]>>>(src, ptr, dt_sum_d, t_end_h);
}


/**
 * Tolerance-based equality test for two values.
 *
 * Returns 1 when |tmp1 - tmp2| <= max(rel_tol * max(|tmp1|, |tmp2|), abs_tol)
 * with rel_tol = 1e-6 and abs_tol = 1e-8, and 0 otherwise (including when
 * either argument is NaN).
 *
 * The comparison uses the magnitude of the difference. Without the absolute
 * value, any pair with tmp1 < tmp2 (for example a negative value tested
 * against 0) would be reported as close.
 *
 * @param tmp1  first value
 * @param tmp2  second value
 * @return      1 if the values are close, 0 otherwise
 */
__device__ REAL isclose(REAL tmp1, REAL tmp2) {
    const REAL rel_tol = 1e-6;
    const REAL abs_tol = 1e-8;

    const REAL diff = fabs(tmp1 - tmp2);
    const REAL tol  = max(rel_tol * max(fabs(tmp1), fabs(tmp2)), abs_tol);

    if (diff <= tol) {
        return 1;
    }

    return 0;
}

/**
 * Kernel: in-place LU factorisation with row exchanges, one thread per `s`.
 *
 * For every s < size_d whose dt_sum_d entry is <= t_end_h:
 *   1. L(n, n, s) is set to 1 for all n.
 *   2. For each stage i, while isclose(U(i, i, s), 0) reports "close",
 *      entries (ii, k) and (ii, k+1) of U are exchanged for all ii, together
 *      with P(k) and P(k+1), for k = i, i+1, ... The exchange stops as soon
 *      as the diagonal entry is no longer close to zero.
 *   3. For each k > i: L(i, k) = U(i, k) / U(i, i), then
 *      U(jj, k) -= L(i, k) * U(jj, i) for jj >= i.
 *
 * The exchange loop is bounded by k + 1 < n_vars so that index k+1 never
 * leaves the matrix; previously k could reach n_vars - 1 and index n_vars was
 * read and written.
 *
 * NOTE(review): the exchange step does not touch L entries computed in
 * earlier stages; confirm that this is intended for the storage layout used.
 * NOTE(review): if no usable pivot is found the division in step 3 is by a
 * (near-)zero value; no singularity handling exists here.
 *
 * @param A          unused in this kernel
 * @param tmp_ptr_d  buffer addressed through access_L / access_U / access_P
 * @param dt_sum_d   per-s values compared against t_end_h
 * @param t_end_h    threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void PLU_decomposition_g(REAL *A, REAL *tmp_ptr_d, REAL *dt_sum_d, REAL t_end_h) {

    unsigned int s  = blockDim.x * blockIdx.x + threadIdx.x;
    size_t n_vars = sp_num_d + 2;

    // Skip out-of-range threads and cells that are already past the threshold.
    if (s < size_d && access_data(dt_sum_d, s) <= t_end_h) {
        REAL tmp0, tmp1;

        // Unit diagonal of L.
        for (int n = 0; n < n_vars; n++) {
            access_L(tmp_ptr_d, n, n, s) = 1.;
        }

        for (int i = 0; i < n_vars; i++) {

            // Pivot search: exchange neighbouring lines until the diagonal
            // entry is usable. Bounded so that k+1 stays inside the matrix.
            for (int k = i; k + 1 < n_vars; k++) {
                if (isclose(access_U(tmp_ptr_d, i, i, s), 0.) == 0.) break;

                for (size_t ii = 0; ii < n_vars; ii++) {
                    tmp0 = access_U(tmp_ptr_d, ii, k, s);
                    access_U(tmp_ptr_d, ii, k, s) = access_U(tmp_ptr_d, ii, k+1, s);
                    access_U(tmp_ptr_d, ii, k+1, s) = tmp0;
                }

                // Record the same exchange in the permutation vector.
                tmp0 = access_P(tmp_ptr_d, k, s);
                access_P(tmp_ptr_d, k, s) = access_P(tmp_ptr_d, k+1, s);
                access_P(tmp_ptr_d, k+1, s) = tmp0;
            }

            // Elimination: store the multiplier in L and update U.
            for (int k = i+1; k < n_vars; k++) {

                access_L(tmp_ptr_d, i, k, s) = tmp1 = access_U(tmp_ptr_d, i, k, s) / access_U(tmp_ptr_d, i, i, s);

                for (size_t jj = i; jj < n_vars; jj++) {
                    access_U(tmp_ptr_d, jj, k, s) -= tmp1*access_U(tmp_ptr_d, jj, i, s);
                }
            }
        }
    }
}


/**
 * Host launcher for PLU_decomposition_g (LU factorisation used when solving
 * the linear systems).
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here.
 *
 * @param A          forwarded to the kernel
 * @param tmp_ptr_d  forwarded to the kernel
 */
__host__ void PLU_decomposition_h(REAL *A, REAL *tmp_ptr_d) {

    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    PLU_decomposition_g<<<grid, block, 0, Stream_opencc[0]>>>(A, tmp_ptr_d, dt_sum_d, t_end_h);
}


/**
 * Kernel: builds the matrix (1/dx) * I - J in the U-layout buffer.
 *
 * Thread layout: x indexes `s`, y indexes `w`. With
 * dx = dt[s] / n_seq_d[k], each in-range thread writes
 * U(ii, w, s) = -J(ii, w, s) for all ii, then adds 1/dx to U(w, w, s).
 *
 * The w < n_vars guard is required because the kernel is launched on a 2D
 * grid whose y extent may be rounded up past n_vars; without it, tail threads
 * write outside the matrix.
 *
 * @param A_d       destination buffer, written through access_U
 * @param J_d       source buffer, read through access_J
 * @param dt        per-s total step, divided by n_seq_d[k]
 * @param k         index into n_seq_d
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void compute_A_g(REAL *A_d, REAL *J_d, REAL *dt, int k, REAL *dt_sum_d, REAL t_end_h) {
    unsigned int s = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int w = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (s < size_d && w < n_vars && access_data(dt_sum_d, s) <= t_end_h) {
        size_t n_steps = n_seq_d[k];

        // Sub-step size for this extrapolation stage.
        REAL dx_tot = access_data(dt, s);
        REAL dx = dx_tot / n_steps;

        for (size_t ii = 0; ii < n_vars; ii++) {
            access_U(A_d, ii, w, s) = - access_J(J_d, ii, w, s);
        }

        // Diagonal contribution of (1/dx) * I.
        access_U(A_d, w, w, s) += 1./dx;
    }
}

/**
 * Host launcher for compute_A_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param A_d  destination buffer forwarded to the kernel
 * @param J_d  source buffer forwarded to the kernel
 * @param dt   per-s step buffer forwarded to the kernel
 * @param k    index into n_seq_d, forwarded to the kernel
 */
__host__ void compute_A_h(REAL *A_d, REAL *J_d, REAL *dt, int k) {

    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    compute_A_g<<<grid, block, 0, Stream_opencc[0]>>>(A_d, J_d, dt, k, dt_sum_d, t_end_h);
}

/**
 * Kernel: solves the factorised system for right-hand side `b`, one thread
 * per `s`, overwriting `b` with the solution.
 *
 * Dynamic shared memory requirement: 2 * blockDim.x * (sp_num_d + 2) REALs.
 * The first half holds one forward-substitution slice (y) per thread, the
 * second half one back-substitution slice (x) per thread. Each thread touches
 * only its own two slices, and every entry is written before it is read, so
 * no zero-initialisation or block barrier is needed.
 *
 * Steps for every s < size_d whose dt_sum_d entry is <= t_end_h:
 *   forward:  y[w] = (b[P[w]] - sum_{i<w} L(i, w) * y[i]) / L(w, w)
 *   backward: x[w] = (y[w]   - sum_{i>w} U(i, w) * x[i]) / U(w, w)
 *   result:   b[w] = x[w]
 *
 * Fixes relative to the previous version:
 *   - x[n_vars-1] is now stored in the x slice; before, it was only written
 *     to `b`, so its contribution was missing from every other row.
 *   - threads no longer zero shared-memory slices belonging to other threads
 *     without a barrier.
 *
 * @param tmp_ptr_d  buffer addressed through access_L / access_U / access_P
 * @param b          right-hand side on entry, solution on exit
 * @param dt_sum_d   per-s values compared against t_end_h
 * @param t_end_h    threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void solve_linear_system_g(REAL *tmp_ptr_d, REAL *b, REAL *dt_sum_d, REAL t_end_h) {

    unsigned int s = blockDim.x * blockIdx.x + threadIdx.x;
    size_t n_vars = sp_num_d + 2;

    extern __shared__ REAL shared[];

    // Per-thread scratch slices inside the block's shared buffer.
    REAL *y = shared + threadIdx.x*n_vars;
    REAL *x = shared + threadIdx.x*n_vars + blockDim.x*n_vars;

    // Skip out-of-range threads and cells that are already past the threshold.
    if (s < size_d && access_data(dt_sum_d, s) <= t_end_h) {
        REAL tmp;

        // Forward substitution on the permuted right-hand side.
        for (int w = 0; w < n_vars; w++) {
            tmp = 0;
            for (int i = 0; i < w; i++) {
                tmp += access_L(tmp_ptr_d, i, w, s) * y[i];
            }

            y[w] = (access_vars_data(b, (int)access_P(tmp_ptr_d, w, s), s) - tmp)/
                   access_L(tmp_ptr_d, w, w, s);
        }

        // Back substitution: last unknown first, kept in x for later rows.
        x[n_vars - 1] = y[n_vars - 1] / access_U(tmp_ptr_d, n_vars - 1, n_vars - 1, s);
        access_vars_data(b, n_vars - 1, s) = x[n_vars - 1];

        for (int w = n_vars - 2; w >= 0; w--) {
            tmp = 0;
            for (int i = w + 1; i < n_vars; i++) {
                tmp += access_U(tmp_ptr_d, i, w, s) * x[i];
            }

            x[w] = (y[w] - tmp)/access_U(tmp_ptr_d, w, w, s);
            access_vars_data(b, w, s) = x[w];
        }
    }
}

/**
 * Host launcher for solve_linear_system_g (substitution step of the LU-based
 * linear solve).
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using a
 * block setting of 32, requests 2 * blockdim.x * (sp_num + 2) REALs of
 * dynamic shared memory, and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here.
 *
 * @param tmp_ptr_d  factorisation buffer forwarded to the kernel
 * @param b          right-hand side / solution buffer forwarded to the kernel
 */
__host__ void solve_linear_system_h(REAL *tmp_ptr_d, REAL *b) {

    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid(n_cells, 32, grid, block);

    // Two scratch slices of (sp_num + 2) REALs for every thread of the block.
    size_t shared_bytes = 2*block.x*(n_species+2)*sizeof(REAL);

    solve_linear_system_g<<<grid, block, shared_bytes, Stream_opencc[0]>>>(tmp_ptr_d, b, dt_sum_d, t_end_h);
}

/**
 * Device helper: error scale for one value.
 *
 * @param c  pointer to the value
 * @return   atol + rtol * |*c|
 */
__device__ REAL compute_scale0_d(REAL *c) {

    const REAL magnitude = fabs(*c);
    const REAL scale = atol + rtol * magnitude;

    return scale;
}

/**
 * Kernel: fills `scale` with compute_scale0_d applied to every entry of `c`.
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. Only in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h are written.
 *
 * @param c         input values, read through access_vars_data
 * @param scale     output scales, written through access_vars_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void compute_scale0_g(REAL *c, REAL *scale, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d || var >= n_vars) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    // Local copy so the helper can take the value by pointer.
    REAL value = access_vars_data(c, var, cell);
    access_vars_data(scale, var, cell) = compute_scale0_d(&value);
}

/**
 * Host launcher for compute_scale0_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param y      input values forwarded to the kernel
 * @param scale  output buffer forwarded to the kernel
 */
__host__ void compute_scale0_h(REAL *y, REAL *scale) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    compute_scale0_g<<<grid, block, 0, Stream_opencc[0]>>>(y, scale, dt_sum_d, t_end_h);
}

/**
 * Kernel: copies `c` into `dy`, element by element.
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. Only in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h are copied.
 *
 * @param dy        destination, written through access_vars_data
 * @param c         source, read through access_vars_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void init_y_temp_g(REAL *dy, REAL *c, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    const bool in_range = (cell < size_d) && (var < n_vars);
    if (!in_range) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    access_vars_data(dy, var, cell) = access_vars_data(c, var, cell);
}

/**
 * Host launcher for init_y_temp_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param dy  destination buffer forwarded to the kernel
 * @param c   source buffer forwarded to the kernel
 */
__host__ void init_y_temp_h(REAL *dy, REAL *c) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    init_y_temp_g<<<grid, block, 0, Stream_opencc[0]>>>(dy, c, dt_sum_d, t_end_h);
}

/**
 * Kernel: adds `dcdt` to `dy`, element by element (dy += dcdt).
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. Only in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h are updated. Each element is
 * updated by exactly one thread with a plain (non-atomic) read-modify-write.
 *
 * @param dy        accumulator, updated through access_vars_data
 * @param dcdt      increment, read through access_vars_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void update_y_temp_g(REAL *dy, REAL *dcdt, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d || var >= n_vars) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    access_vars_data(dy, var, cell) += access_vars_data(dcdt, var, cell);
}

/**
 * Host launcher for update_y_temp_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param dy    accumulator buffer forwarded to the kernel
 * @param dcdt  increment buffer forwarded to the kernel
 */
__host__ void update_y_temp_h(REAL *dy, REAL *dcdt) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    update_y_temp_g<<<grid, block, 0, Stream_opencc[0]>>>(dy, dcdt, dt_sum_d, t_end_h);
}

/**
 * Kernel: per-s scaled norm of `dcdt`, clamped to at most 1.
 *
 * One thread per `s`. For every s < size_d whose dt_sum_d entry is
 * <= t_end_h:
 *   norm     = sqrt( sum_i (dcdt[i] / scale[i])^2 ),  i < sp_num_d + 2
 *   denom[s] = fmin(1, norm + SMALL)
 *
 * @param dcdt      values, read through access_vars_data
 * @param scale     per-variable scales, read through access_vars_data
 * @param denom     per-s output, written through access_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void compute_dy1_g(REAL *dcdt, REAL *scale, REAL *denom, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;

    if (cell >= size_d) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    size_t n_vars = sp_num_d + 2;

    // Accumulate the squared, scaled components.
    REAL sum_sq = 0.;
    for (int i = 0; i < n_vars; i++) {
        const REAL ratio = access_vars_data(dcdt, i, cell) / access_vars_data(scale, i, cell);
        sum_sq += ratio*ratio;
    }

    const REAL norm = sqrt(sum_sq);

    access_data(denom, cell) = fmin(1., norm + SMALL);
}

/**
 * Host launcher for compute_dy1_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here.
 *
 * @param dcdt   values forwarded to the kernel
 * @param scale  scales forwarded to the kernel
 * @param denom  per-s output buffer forwarded to the kernel
 */
__host__ void compute_dy1_h(REAL *dcdt, REAL *scale, REAL *denom) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    compute_dy1_g<<<grid, block, 0, Stream_opencc[0]>>>(dcdt, scale, denom, dt_sum_d, t_end_h);
}

/**
 * Kernel: dcdt = dydx - dcdt / dx, element by element, with
 * dx = dt[s] / n_seq_d[k].
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. Only in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h are updated.
 *
 * @param dydx      values read through access_vars_data
 * @param dt        per-s total step, divided by n_seq_d[k]
 * @param k         index into n_seq_d
 * @param dcdt      updated in place through access_vars_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void update_dcdt_g(REAL *dydx, REAL *dt, int k, REAL *dcdt, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d || var >= n_vars) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    // Sub-step size for this stage.
    const size_t sub_steps = n_seq_d[k];
    const REAL full_step = access_data(dt, cell);
    const REAL sub_step = full_step / sub_steps;

    const REAL previous = access_vars_data(dcdt, var, cell);
    access_vars_data(dcdt, var, cell) = access_vars_data(dydx, var, cell) - previous/sub_step;
}

/**
 * Host launcher for update_dcdt_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param dydx  values forwarded to the kernel
 * @param dt    per-s step buffer forwarded to the kernel
 * @param k     index into n_seq_d, forwarded to the kernel
 * @param dcdt  buffer updated in place by the kernel
 */
__host__ void update_dcdt_h(REAL *dydx, REAL *dt, int k, REAL *dcdt) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    update_dcdt_g<<<grid, block, 0, Stream_opencc[0]>>>(dydx, dt, k, dcdt, dt_sum_d, t_end_h);
}

/**
 * Kernel: c = y_tmp + dcdt, element by element.
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. Only in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h are written.
 *
 * @param c         destination, written through access_vars_data
 * @param y_tmp     first addend, read through access_vars_data
 * @param dcdt      second addend, read through access_vars_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void update_c_g(REAL *c, REAL *y_tmp, REAL *dcdt, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    const bool in_range = (cell < size_d) && (var < n_vars);
    if (!in_range) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    access_vars_data(c, var, cell) =
        access_vars_data(y_tmp, var, cell) + access_vars_data(dcdt, var, cell);
}

/**
 * Host launcher for update_c_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param c      destination buffer forwarded to the kernel
 * @param y_tmp  first addend buffer forwarded to the kernel
 * @param dcdt   second addend buffer forwarded to the kernel
 */
__host__ void update_c_h(REAL *c, REAL *y_tmp, REAL *dcdt) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    update_c_g<<<grid, block, 0, Stream_opencc[0]>>>(c, y_tmp, dcdt, dt_sum_d, t_end_h);
}

/**
 * Kernel: stores `y_seq` into slot `k` of the table: table(w, k, s) = y_seq(w, s).
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. Only in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h are written.
 *
 * @param k         table slot to fill
 * @param y_seq     source, read through access_vars_data
 * @param table     destination, written through access_table
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void compute_table_g(int k, REAL *y_seq, REAL *table, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d || var >= n_vars) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    access_table(table, var, k, cell) = access_vars_data(y_seq, var, cell);
}

/**
 * Host launcher for compute_table_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param k      table slot, forwarded to the kernel
 * @param y_seq  source buffer forwarded to the kernel
 * @param table  destination buffer forwarded to the kernel
 */
__host__ void compute_table_h(int k, REAL *y_seq, REAL *table) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    compute_table_g<<<grid, block, 0, Stream_opencc[0]>>>(k, y_seq, table, dt_sum_d, t_end_h);
}

/**
 * Kernel: extrapolation step over the table for stage `k`.
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. For in-range
 * (w, s) whose dt_sum_d entry is <= t_end_h:
 *   for j = k-1 down to 1:
 *     table(w, j-1) = table(w, j) + coeff_d[j + k*(k_max+1)] * (table(w, j) - table(w, j-1))
 *   y(w) = table(w, 0) + coeff_d[k*(k_max+1)] * (table(w, 0) - y(w))
 *
 * @param k         current stage; selects the coefficient row k*(k_max+1)
 * @param table     table updated in place through access_table
 * @param y         value updated in place through access_vars_data
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void extrapolate_g(int k, REAL *table, REAL *y, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d || var >= n_vars) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    // Walk the table from slot k-1 down to slot 1, updating the slot below.
    for (int j = k-1; j > 0; j--) {
        const REAL upper = access_table(table, var, j, cell);
        const REAL lower = access_table(table, var, j-1, cell);

        access_table(table, var, j-1, cell) =
            upper + coeff_d[j + k*(k_max+1)] * (upper - lower);
    }

    // Final extrapolation into y using the first coefficient of row k.
    const REAL head = access_table(table, var, 0, cell);
    const REAL current = access_vars_data(y, var, cell);

    access_vars_data(y, var, cell) = head + coeff_d[k*(k_max+1)] * (head - current);
}

/**
 * Host launcher for extrapolate_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. No synchronization or launch-error check is done here.
 *
 * @param k      current stage, forwarded to the kernel
 * @param table  table buffer forwarded to the kernel
 * @param y      value buffer forwarded to the kernel
 */
__host__ void extrapolate_h(int k, REAL *table, REAL *y) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    extrapolate_g<<<grid, block, 0, Stream_opencc[0]>>>(k, table, y, dt_sum_d, t_end_h);
}

/**
 * Kernel: sets the per-s rejection flag from the scaled size of `dy`.
 *
 * One thread per `s` (s < size_d). With d = denom[s]:
 *   - if any component satisfies |dy[i]| > scale[i] * d, flag_reject[s] = 1;
 *   - theta = sqrt( sum_i (dy[i] / scale[i])^2 ) / d; if theta > 1,
 *     flag_reject[s] = 1;
 *   - finally, if dt_sum_d[s] >= t_end_h, flag_reject[s] is forced to 0.
 * When none of these conditions holds, flag_reject[s] is left unchanged.
 *
 * The per-component guard tests the magnitude of dy[i]; a signed comparison
 * would ignore large negative components.
 *
 * @param denom        per-s denominator, read through access_data
 * @param dy           values, read through access_vars_data
 * @param scale        per-variable scales, read through access_vars_data
 * @param flag_reject  per-s flag, written through access_data
 * @param dt_sum_d     per-s values compared against t_end_h
 * @param t_end_h      threshold; flags of entries with dt_sum_d >= t_end_h
 *                     are cleared
 */
__global__ void compute_flag_g(REAL *denom, REAL *dy, REAL *scale, REAL *flag_reject, REAL *dt_sum_d, REAL t_end_h) {
    unsigned int s = blockDim.x * blockIdx.x + threadIdx.x;
    size_t n_vars = sp_num_d + 2;

    if (s < size_d) {
        const REAL denom_s = access_data(denom, s);
        REAL sum_sq = 0.;

        for (int i = 0; i < n_vars; i++) {

            const REAL dy_i = access_vars_data(dy, i, s);
            const REAL scale_i = access_vars_data(scale, i, s);

            sum_sq += dy_i*dy_i/(scale_i*scale_i);

            // Component-wise guard on the magnitude of dy.
            if (fabs(dy_i) > scale_i * denom_s) {
                access_data(flag_reject, s) = 1.0;
            }
        }

        const REAL theta = sqrt(sum_sq)/denom_s;

        if (theta > 1.) {
            access_data(flag_reject, s) = 1.0;
        }

        // Entries that reached the threshold are never rejected.
        if (access_data(dt_sum_d, s) >= t_end_h) {
            access_data(flag_reject, s) = 0.0;
        }
    }

}

/**
 * Host launcher for compute_flag_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here.
 *
 * @param denom        per-s denominator buffer forwarded to the kernel
 * @param dy           values forwarded to the kernel
 * @param scale        scales forwarded to the kernel
 * @param flag_reject  per-s flag buffer forwarded to the kernel
 */
__host__ void compute_flag_h(REAL *denom, REAL *dy, REAL *scale, REAL *flag_reject) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    compute_flag_g<<<grid, block, 0, Stream_opencc[0]>>>(denom, dy, scale, flag_reject, dt_sum_d, t_end_h);
}

/**
 * Kernel: per-s scaled RMS difference between `y` and table slot 0, turned
 * into a rejection flag.
 *
 * One thread per `s`. For every s < size_d whose dt_sum_d entry is
 * <= t_end_h:
 *   err = sqrt( (1/n_vars) * sum_i ((y[i] - table(i, 0)) / scale[i])^2 )
 *   flag_reject[s] = 1 if err > 1, otherwise 0
 *
 * @param y            values, read through access_vars_data
 * @param table        table, slot 0 read through access_table
 * @param scale        per-variable scales, read through access_vars_data
 * @param flag_reject  per-s flag, written through access_data
 * @param dt_sum_d     per-s values compared against t_end_h
 * @param t_end_h      threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void err_compute_g(REAL *y, REAL *table, REAL *scale, REAL *flag_reject, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    // Sum of squared, scaled differences.
    REAL sum_sq = 0.;
    for (int i = 0; i < n_vars; i++) {
        const REAL scaled_diff =
            (access_vars_data(y, i, cell) - access_table(table, i, 0, cell))/
             access_vars_data(scale, i, cell);
        sum_sq += scaled_diff*scaled_diff;
    }

    const REAL inv_count = 1./n_vars;
    const REAL rms = sqrt(sum_sq*inv_count);

    access_data(flag_reject, cell) = (rms > 1) ? 1.0 : 0.0;
}

/**
 * Host launcher for err_compute_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here.
 *
 * @param y            values forwarded to the kernel
 * @param table        table buffer forwarded to the kernel
 * @param scale        scales forwarded to the kernel
 * @param flag_reject  per-s flag buffer forwarded to the kernel
 */
__host__ void err_compute_h(REAL *y, REAL *table, REAL *scale, REAL *flag_reject) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    err_compute_g<<<grid, block, 0, Stream_opencc[0]>>>(y, table, scale, flag_reject, dt_sum_d, t_end_h);
}

/**
 * Kernel: rescales the step for entries whose flag is zero.
 *
 * One thread per `s` (s < size_d). When dt_sum[s] < dt_end and flag[s] == 0,
 * dx_new[s] = |dx[s]| * num. Other entries are left unchanged.
 *
 * @param num     scale factor
 * @param dt_sum  per-s values compared against dt_end
 * @param dt_end  threshold
 * @param dx      current step, read through access_data
 * @param dx_new  new step, written through access_data
 * @param flag    per-s flag; only entries equal to 0 are updated
 */
__global__ void update_dx_g(REAL num, REAL *dt_sum, REAL dt_end, REAL *dx, REAL *dx_new, REAL *flag) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;

    if (cell >= size_d) return;

    const bool unfinished = access_data(dt_sum, cell) < dt_end;
    if (unfinished && access_data(flag, cell) == 0) {
        access_data(dx_new, cell) = fabs(access_data(dx, cell))*num;
    }
}

/**
 * Host launcher for update_dx_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. All arguments
 * are forwarded unchanged. No synchronization or launch-error check is done
 * here.
 *
 * @param num     scale factor
 * @param dt_sum  per-s values compared against dt_end
 * @param dt_end  threshold
 * @param dx      current step buffer
 * @param dx_new  new step buffer
 * @param flag    per-s flag buffer
 */
__host__ void update_dx_h(REAL num, REAL *dt_sum, REAL dt_end, REAL *dx, REAL *dx_new, REAL *flag) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    update_dx_g<<<grid, block, 0, Stream_opencc[0]>>>(num, dt_sum, dt_end, dx, dx_new, flag);
}

/**
 * Kernel: rescales the step for entries whose flag is non-zero and marks them.
 *
 * One thread per `s` (s < size_d). When dt_sum[s] < dt_end and flag[s] != 0,
 * dx_new[s] = |dx[s]| * num and successed[s] = 1. Other entries are left
 * unchanged.
 *
 * @param num        scale factor
 * @param dt_sum     per-s values compared against dt_end
 * @param dt_end     threshold
 * @param dx         current step, read through access_data
 * @param dx_new     new step, written through access_data
 * @param flag       per-s flag; only non-zero entries are updated
 * @param successed  per-s marker set to 1 for updated entries
 */
__global__ void update_dx_modify_g(REAL num, REAL *dt_sum, REAL dt_end, REAL *dx, REAL *dx_new, REAL *flag, REAL *successed) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;

    if (cell >= size_d) return;

    const bool unfinished = access_data(dt_sum, cell) < dt_end;
    if (unfinished && access_data(flag, cell) != 0) {
        access_data(dx_new, cell) = fabs(access_data(dx, cell))*num;
        access_data(successed, cell) = 1.;
    }
}

/**
 * Host launcher for update_dx_modify_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. All arguments
 * are forwarded unchanged. No synchronization or launch-error check is done
 * here.
 *
 * @param num        scale factor
 * @param dt_sum     per-s values compared against dt_end
 * @param dt_end     threshold
 * @param dx         current step buffer
 * @param dx_new     new step buffer
 * @param flag       per-s flag buffer
 * @param successed  per-s marker buffer
 */
__host__ void update_dx_modify_h(REAL num, REAL *dt_sum, REAL dt_end, REAL *dx, REAL *dx_new, REAL *flag, REAL *successed) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    update_dx_modify_g<<<grid, block, 0, Stream_opencc[0]>>>(num, dt_sum, dt_end, dx, dx_new, flag, successed);
}

/**
 * Kernel: advances the per-s running value held in `dx_new` by the step in
 * `dx`, clipping the step so the running value does not pass dt_end.
 *
 * One thread per `s` (s < size_d). With sum = dx_new[s] and step = dx[s]:
 *   - if sum < dt_end:
 *       dt_new[s] = step;
 *       if sum + step >= dt_end, step (and dx[s]) become dt_end - sum;
 *       dx_new[s] += step;
 *   - otherwise dx_new[s] += 1e-10.
 *
 * NOTE(review): despite its name, `dx_new` is used here as the accumulated
 * value compared against dt_end; confirm against the caller.
 *
 * @param dx      per-s step; overwritten with the clipped step when clipping
 * @param dt_new  per-s output receiving the unclipped step
 * @param dt_end  upper limit for the running value
 * @param dx_new  per-s running value, updated in place
 */
__global__ void update_dt_sum_g(REAL *dx, REAL *dt_new, REAL dt_end, REAL *dx_new) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;

    if (cell >= size_d) return;

    const REAL running = access_data(dx_new, cell);
    REAL step = access_data(dx, cell);

    // Already at or past the limit: nudge the running value and stop.
    if (!(running < dt_end)) {
        access_data(dx_new, cell) += 1e-10;
        return;
    }

    const REAL reached = running + step;

    access_data(dt_new, cell) = step;

    // Clip the step so the running value lands exactly on dt_end.
    if (reached >= dt_end) {
        step = dt_end - running;
        access_data(dx, cell) = step;
    }

    access_data(dx_new, cell) += step;
}

/**
 * Host launcher for update_dt_sum_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. All arguments
 * are forwarded unchanged. No synchronization or launch-error check is done
 * here.
 *
 * @param dx      per-s step buffer
 * @param dt_new  per-s output buffer
 * @param dt_end  upper limit passed to the kernel
 * @param dx_new  per-s running-value buffer
 */
__host__ void update_dt_sum_h(REAL *dx, REAL *dt_new, REAL dt_end, REAL *dx_new) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    update_dt_sum_g<<<grid, block, 0, Stream_opencc[0]>>>(dx, dt_new, dt_end, dx_new);
}

/**
 * Kernel: inspects flag_reject[s] for every s < size_d and assigns 1 to
 * `result` when the flag is non-zero.
 *
 * NOTE(review): `result` is a by-value kernel argument, so the assignment
 * only changes this thread's private copy and nothing is visible to the
 * host or to other threads. As written the kernel has no observable output;
 * reporting the result would need a pointer argument (interface change).
 *
 * @param flag_reject  per-s flags, read through access_data
 * @param result       by-value scalar; assignments to it are not observable
 */
__global__ void judge_flag_g(REAL *flag_reject, REAL result) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;

    if (cell >= size_d) return;

    if (access_data(flag_reject, cell) != 0) {
        result = 1.;
    }
}

/**
 * Host launcher for judge_flag_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here.
 *
 * NOTE(review): `result` is taken and forwarded by value, so the caller's
 * variable is never modified by this call.
 *
 * @param flag_reject  per-s flag buffer forwarded to the kernel
 * @param result       scalar forwarded by value to the kernel
 */
__host__ void judge_flag_h(REAL *flag_reject, REAL result){
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    judge_flag_g<<<grid, block, 0, Stream_opencc[0]>>>(flag_reject, result);
}

/**
 * Kernel: synchronises `c_old` and `c_new` depending on dt_sum.
 *
 * Thread layout: x indexes `s`, y indexes the variable `w`. For in-range
 * (w, s):
 *   - dt_sum[s] <= dt_end : c_old(w, s) = c_new(w, s)
 *   - otherwise           : c_new(w, s) = c_old(w, s)
 *
 * @param dt_sum  per-s values compared against dt_end
 * @param dt_end  threshold selecting the copy direction
 * @param c_old   buffer accessed through access_vars_data
 * @param c_new   buffer accessed through access_vars_data
 */
__global__ void dt_sum_dt_end_g(REAL *dt_sum, REAL dt_end, REAL *c_old, REAL *c_new){
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int var  = blockDim.y * blockIdx.y + threadIdx.y;
    size_t n_vars = sp_num_d + 2;

    if (cell >= size_d || var >= n_vars) return;

    // Pick the copy direction once, then perform a single assignment.
    const bool take_new = access_data(dt_sum, cell) <= dt_end;
    REAL *dst  = take_new ? c_old : c_new;
    REAL *from = take_new ? c_new : c_old;

    access_vars_data(dst, var, cell) = access_vars_data(from, var, cell);
}

/**
 * Host launcher for dt_sum_dt_end_g.
 *
 * Builds a 2D launch configuration with set_block_grid2d over
 * (size, sp_num + 2) using block_set_J02d and enqueues the kernel on
 * Stream_opencc[0]. All arguments are forwarded unchanged. No
 * synchronization or launch-error check is done here.
 *
 * @param dt_sum  per-s values compared against dt_end
 * @param dt_end  threshold selecting the copy direction
 * @param c_old   buffer forwarded to the kernel
 * @param c_new   buffer forwarded to the kernel
 */
__host__ void dt_sum_dt_end_h(REAL *dt_sum, REAL dt_end, REAL *c_old, REAL *c_new){
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;
    size_t n_species = species_d_ptr->species_const_d.sp_num;

    dim3 grid, block;
    set_block_grid2d(n_cells, n_species + 2, block_set_J02d, grid, block);

    dt_sum_dt_end_g<<<grid, block, 0, Stream_opencc[0]>>>(dt_sum, dt_end, c_old, c_new);
}

/**
 * Kernel: raises a single shared flag when any entry is still below dt_end.
 *
 * One thread per `s`. For every s < size_d whose dt_sum_d entry is
 * <= t_end_h, if dt_sum[s] < dt_end the scalar *flag is set to 1. Several
 * threads may store to *flag, but all of them store the same value. The
 * kernel never clears the flag.
 *
 * @param dt_sum    per-s values compared against dt_end
 * @param dt_end    threshold for raising the flag
 * @param flag      pointer to the single output scalar
 * @param dt_sum_d  per-s values compared against t_end_h
 * @param t_end_h   threshold; entries with dt_sum_d > t_end_h are skipped
 */
__global__ void flag_end_g(REAL *dt_sum, REAL dt_end, REAL *flag, REAL *dt_sum_d, REAL t_end_h) {
    const unsigned int cell = blockDim.x * blockIdx.x + threadIdx.x;

    if (cell >= size_d) return;
    if (!(access_data(dt_sum_d, cell) <= t_end_h)) return;

    if (access_data(dt_sum, cell) < dt_end) {
        *flag = 1.;
    }
}

/**
 * Host launcher for flag_end_g.
 *
 * Builds a 1D launch configuration with set_block_grid over `size` using
 * block_set_PLU and enqueues the kernel on Stream_opencc[0]. No
 * synchronization or launch-error check is done here, and the flag is not
 * reset before the launch.
 *
 * @param dt_sum  per-s values compared against dt_end
 * @param dt_end  threshold forwarded to the kernel
 * @param flag_d  pointer to the output scalar, forwarded to the kernel
 */
__host__ void flag_end_h(REAL *dt_sum, REAL dt_end, REAL *flag_d) {
    size_t n_cells = thermoFluid_d_ptr->thermo_ptr_d.size;

    dim3 grid, block;
    set_block_grid(n_cells, block_set_PLU, grid, block);

    flag_end_g<<<grid, block, 0, Stream_opencc[0]>>>(dt_sum, dt_end, flag_d, dt_sum_d, t_end_h);
}