/*! \file StackResampler.cu \brief Contains definition of CUDA implementation of a class for spline resampling of irregularly sampled columns in the z-direction \author Jesper Andersson \version 1.0b, March, 2016. */ // // StackResampler.cu // // Jesper Andersson, FMRIB Image Analysis Group // // Copyright (C) 2016 University of Oxford // // Because of a bug in cuda_fp16.hpp, that gets included by cublas_v2.h, it has to // be included before any include files that set up anything related to the std-lib. // If not, there will be an ambiguity in cuda_fp16.hpp about wether to use the // old-style C isinf or the new (since C++11) std::isinf. #include "cublas_v2.h" #include #include #include #include #include #include #include #include #include #include #include #pragma push #pragma diag_suppress = code_is_unreachable // Supress warnings from armawrap #include "armawrap/newmat.h" #include "newimage/newimage.h" #include "EddyHelperClasses.h" #include "CudaVolume.h" #pragma pop #include "EddyCudaHelperFunctions.h" #include "StackResampler.h" #include "EddyKernels.h" #include "EddyMatrixKernels.h" #include "EddyFunctors.h" namespace EDDY { const dim3 StackResampler::_threads_per_block_WtW_StS = dim3(16,16); StackResampler::StackResampler(const EDDY::CudaVolume& stack, const EDDY::CudaVolume& zcoord, const EDDY::CudaVolume& pred, const EDDY::CudaVolume& mask, double lambda) EddyTry : _resvol(stack,false), _mask(stack,false) { // static unsigned int cnt = 0; unsigned int matsz = stack.Size(2); unsigned int nmat = stack.Size(0); // Number of matrices per xz-plane // Allocate memory for storing matrices/vectors on the GPU thrust::device_vector StS_matrix(sqr(matsz)); thrust::device_vector empty_StS_matrix(sqr(matsz)); thrust::device_vector gpu_W_matrix(sqr(matsz)); thrust::device_vector gpu_sorted_zcoord(zcoord.Size()); thrust::device_vector gpu_Wir_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_weights(nmat*matsz); thrust::device_vector gpu_diagw_p_vectors(nmat*matsz); thrust::device_vector gpu_diagw_W_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_Wirty_vectors(nmat*matsz); thrust::device_vector gpu_dWtdp_vectors(nmat*matsz); thrust::device_vector gpu_WtW_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_dWtdW_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_sum_vectors(nmat*matsz); thrust::device_vector gpu_sum_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_c_hat_vectors(nmat*matsz); thrust::device_vector gpu_y_hat_vectors(nmat*matsz); // EDDY::CudaVolume weights = pred; // weights = 0.0; // Get regularisation matrix once and for all //cout << "Making reg matrix" << endl; cout.flush(); get_StS(matsz,lambda,StS_matrix); get_StS(matsz,0.0,empty_StS_matrix); // Get weight matrix for regular grid once and for all //cout << "Making W" << endl; cout.flush(); get_regular_W(matsz,gpu_W_matrix); // Populate mask //cout << "Making mask" << endl; cout.flush(); make_mask(mask,zcoord,true,_mask); // Get a volume of sorted z-ccords //cout << "sorting zcoords" << endl; cout.flush(); sort_zcoords(zcoord,true,gpu_sorted_zcoord); // Interpolate along z-columns for (unsigned int j=0; j()); // Add (pairwise) all Wir'*Wir and (diag{w}W)'*(diag{w}W) //cout << "Making sum matrices" << endl; cout.flush(); thrust::transform(gpu_WtW_matrices.begin(),gpu_WtW_matrices.end(),gpu_dWtdW_matrices.begin(),gpu_sum_matrices.begin(),thrust::plus()); // Solve for spline coefficients //cout << "Making c_hat" << endl; cout.flush(); solve_for_c_hat(gpu_sum_matrices,gpu_sum_vectors,matsz,nmat,true,gpu_c_hat_vectors); // Multiply by W to solve for y_hat (interpolate in z-direction) //cout << "Making y_hat" << endl; cout.flush(); make_y_hat_vectors(gpu_W_matrix,gpu_c_hat_vectors,matsz,nmat,true,gpu_y_hat_vectors); // Transfer solution vectors to result volume //cout << "Transferring y_hat" << endl; cout.flush(); transfer_y_hat_to_volume(gpu_y_hat_vectors,j,true,_resvol); /* if (cnt==19 && j==30) { write_debug_info_for_pred_resampling(54,j,"debug_info.txt",zcoord,stack,pred,gpu_sorted_zcoord, gpu_W_matrix,gpu_Wir_matrices,gpu_weights,gpu_diagw_p_vectors, gpu_diagw_W_matrices,gpu_Wirty_vectors,gpu_dWtdp_vectors, gpu_WtW_matrices,gpu_dWtdW_matrices,gpu_sum_vectors, gpu_sum_matrices,gpu_c_hat_vectors,gpu_y_hat_vectors); } */ } // char fname[256]; sprintf(fname,"weights_%02d",cnt); // weights.Write(std::string(fname)); // cnt++; } EddyCatch StackResampler::StackResampler(const EDDY::CudaVolume& stack, const EDDY::CudaVolume& zcoord, const EDDY::CudaVolume& mask, NEWIMAGE::interpolation interp, double lambda) EddyTry : _resvol(stack,false), _mask(stack,false) { // Populate mask make_mask(mask,zcoord,true,_mask); if (interp == NEWIMAGE::spline) { unsigned int matsz = stack.Size(2); unsigned int nmat = stack.Size(0); // Number of matrices per xz-plane // Allocate memory for storing matrices/vectors on the GPU thrust::device_vector gpu_StS_matrix(sqr(matsz)); thrust::device_vector gpu_W_matrix(sqr(matsz)); thrust::device_vector gpu_Wir_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_WtW_StS_matrices(nmat*sqr(matsz)); thrust::device_vector gpu_Wirty_vectors(nmat*matsz); thrust::device_vector gpu_c_hat_vectors(nmat*matsz); thrust::device_vector gpu_y_hat_vectors(nmat*matsz); // Get regularisation matrix once and for all get_StS(matsz,lambda,gpu_StS_matrix); // Get spline matrix for regular grid once and for all get_regular_W(matsz,gpu_W_matrix); // Interpolate along z-columns for (unsigned int j=0; j gpu_sorted_zcoord(zcoord.Size()); thrust::device_vector gpu_sorted_intensities(zcoord.Size()); thrust::device_vector gpu_interpolated_columns(zcoord.Size(),0.0); // Sort the z-ccordinates and the intensities with z-coord as key sort_zcoords_and_intensities(zcoord,stack,true,gpu_sorted_zcoord,gpu_sorted_intensities); // Do linear interpolation along z-columns linear_interpolate_columns(gpu_sorted_zcoord,gpu_sorted_intensities,zcoord.Size(0),zcoord.Size(1),zcoord.Size(2),true,gpu_interpolated_columns); // Transfer interpolated vectors to results volume transfer_interpolated_columns_to_volume(gpu_interpolated_columns,true,_resvol); } else throw EddyException("StackResampler::StackResampler: Invalid interpolation method"); } EddyCatch void StackResampler::get_StS(unsigned int sz, double lambda, thrust::device_vector& StS) const EddyTry { float six = lambda * 6.0; float minusfour = - lambda * 4.0; float one = lambda; thrust::host_vector hStS(StS.size(),0.0); hStS[rfindx(0,0,sz)] = six; hStS[rfindx(0,1,sz)] = minusfour; hStS[rfindx(0,2,sz)] = one; hStS[rfindx(0,(sz-2),sz)] = one; hStS[rfindx(0,(sz-1),sz)] = minusfour; hStS[rfindx(1,0,sz)] = minusfour; hStS[rfindx(1,1,sz)] = six; hStS[rfindx(1,2,sz)] = minusfour; hStS[rfindx(1,3,sz)] = one; hStS[rfindx(1,(sz-1),sz)] = one; for ( unsigned int i=2; i<(sz-2); i++) { hStS[rfindx(i,i-2,sz)] = one; hStS[rfindx(i,i-1,sz)] = minusfour; hStS[rfindx(i,i,sz)] = six; hStS[rfindx(i,i+1,sz)] = minusfour; hStS[rfindx(i,i+2,sz)] = one; } hStS[rfindx((sz-2),0,sz)] = one; hStS[rfindx((sz-2),(sz-4),sz)] = one; hStS[rfindx((sz-2),(sz-3),sz)] = minusfour; hStS[rfindx((sz-2),(sz-2),sz)] = six; hStS[rfindx((sz-2),(sz-1),sz)] = minusfour; hStS[rfindx((sz-1),0,sz)] = minusfour; hStS[rfindx((sz-1),1,sz)] = one; hStS[rfindx((sz-1),(sz-3),sz)] = one; hStS[rfindx((sz-1),(sz-2),sz)] = minusfour; hStS[rfindx((sz-1),(sz-1),sz)] = six; StS = hStS; return; } EddyCatch void StackResampler::get_regular_W(unsigned int sz, thrust::device_vector& W) const EddyTry { thrust::host_vector hW(W.size(),0.0); hW[rfindx(0,0,sz)] = 5.0/6.0; hW[rfindx(0,1,sz)] = 1.0/6.0; for ( unsigned int i=1; i<(sz-1); i++) { hW[rfindx(i,i-1,sz)] = 1.0/6.0; hW[rfindx(i,i,sz)] = 4.0/6.0; hW[rfindx(i,i+1,sz)] = 1.0/6.0; } hW[rfindx((sz-1),(sz-2),sz)] = 1.0/6.0; hW[rfindx((sz-1),(sz-1),sz)] = 5.0/6.0; W = hW; return; } EddyCatch void StackResampler::make_mask(const EDDY::CudaVolume& inmask, const EDDY::CudaVolume& zcoord, bool sync, EDDY::CudaVolume& omask) EddyTry { omask = 0.0; int nblocks = inmask.Size(1); int tpb = inmask.Size(0); EddyKernels::make_mask_from_stack<<>>(inmask.GetPtr(),zcoord.GetPtr(),inmask.Size(0), inmask.Size(1),inmask.Size(2),omask.GetPtr()); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::make_mask_from_stack"); } EddyCatch void StackResampler::sort_zcoords(const EDDY::CudaVolume& zcoord, bool sync, thrust::device_vector& szcoord) const EddyTry { // First transfer all columns and check if they are sorted thrust::device_vector ns_flags(zcoord.Size(0)*zcoord.Size(1),0); // Non-zero if a column is "not sorted" EddyKernels::TransferAndCheckSorting<<>>(zcoord.GetPtr(),zcoord.Size(0), zcoord.Size(1),zcoord.Size(2), thrust::raw_pointer_cast(szcoord.data()), thrust::raw_pointer_cast(ns_flags.data())); EddyCudaHelperFunctions::CudaSync("EddyKernels::TransferAndCheckSorting"); unsigned int nnsort = thrust::reduce(ns_flags.begin(),ns_flags.end(),0,EDDY::Sum()); if (nnsort) { // If there are columns that are not sorted thrust::host_vector host_ns_flags = ns_flags; thrust::host_vector host_nsort_indx(nnsort,0); for (unsigned int i=0, n=0; i device_nsort_indx = host_nsort_indx; int nb = (nnsort / 32) + 1; EddyKernels::SortVectors<<>>(thrust::raw_pointer_cast(device_nsort_indx.data()),nnsort, zcoord.Size(2),thrust::raw_pointer_cast(szcoord.data()),NULL); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::SortVectors"); /* // Check that vectors really are sorted. Only for use during testing. thrust::host_vector hvec = szcoord; for (unsigned int vi=0; vi>>(zcoord.GetPtr(),zcoord.Size(0), zcoord.Size(1),zcoord.Size(2), thrust::raw_pointer_cast(szcoord.data()), thrust::raw_pointer_cast(ns_flags.data())); EddyCudaHelperFunctions::CudaSync("EddyKernels::TransferAndCheckSorting"); unsigned int nnsort = thrust::reduce(ns_flags.begin(),ns_flags.end(),0,EDDY::Sum()); // Transfer all intensity values as well EddyKernels::TransferVolumeToVectors<<>>(data.GetPtr(),data.Size(0), data.Size(1),data.Size(2), thrust::raw_pointer_cast(sdata.data())); if (nnsort) { // If there are columns that are not sorted thrust::host_vector host_ns_flags = ns_flags; thrust::host_vector host_nsort_indx(nnsort,0); for (unsigned int i=0, n=0; i device_nsort_indx = host_nsort_indx; int nb = (nnsort / 32) + 1; EddyKernels::SortVectors<<>>(thrust::raw_pointer_cast(device_nsort_indx.data()),nnsort, zcoord.Size(2),thrust::raw_pointer_cast(szcoord.data()), thrust::raw_pointer_cast(sdata.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::SortVectors"); } } EddyCatch void StackResampler::linear_interpolate_columns(const thrust::device_vector& zcoord, const thrust::device_vector& val, unsigned int xsz, unsigned int ysz, unsigned int zsz, bool sync, thrust::device_vector& ival) const EddyTry { EddyKernels::LinearInterpolate<<>>(thrust::raw_pointer_cast(zcoord.data()), thrust::raw_pointer_cast(val.data()),zsz, thrust::raw_pointer_cast(ival.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::LinearInterpolate"); } EddyCatch void StackResampler::transfer_interpolated_columns_to_volume(const thrust::device_vector& zcols, bool sync, EDDY::CudaVolume& vol) EddyTry { EddyKernels::TransferColumnsToVolume<<>>(thrust::raw_pointer_cast(zcols.data()), vol.Size(0),vol.Size(1), vol.Size(2),vol.GetPtr()); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::TransferColumnsToVolume"); } EddyCatch void StackResampler::make_weight_vectors(const thrust::device_vector& zcoord, unsigned int xsz, unsigned int zsz, unsigned int xzp, bool sync, thrust::device_vector& weights) const EddyTry { EddyKernels::MakeWeights<<>>(thrust::raw_pointer_cast(zcoord.data()),xsz, zsz,xzp,thrust::raw_pointer_cast(weights.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::MakeWeights"); } EddyCatch void StackResampler::insert_weights(const thrust::device_vector& wvec, unsigned int j, bool sync, EDDY::CudaVolume& wvol) const EddyTry { EddyKernels::InsertWeights<<>>(thrust::raw_pointer_cast(wvec.data()),j,wvol.Size(0), wvol.Size(1),wvol.Size(2),wvol.GetPtr()); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::InsertWeights"); } EddyCatch void StackResampler::make_diagw_p_vectors(const EDDY::CudaVolume& pred, const thrust::device_vector& wgts, unsigned int xzp, bool sync, thrust::device_vector& wp) const EddyTry { EddyKernels::MakeDiagwpVecs<<>>(pred.GetPtr(),thrust::raw_pointer_cast(wgts.data()), pred.Size(0),pred.Size(1),pred.Size(2),xzp, thrust::raw_pointer_cast(wp.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyKernels::MakeDiagwpVecs"); } EddyCatch void StackResampler::make_diagw_W_matrices(const thrust::device_vector& wgts, const thrust::device_vector& W, unsigned int matsz, unsigned int nmat, bool sync, thrust::device_vector& diagwW) const EddyTry { EddyMatrixKernels::DiagwA<<>>(thrust::raw_pointer_cast(wgts.data()), thrust::raw_pointer_cast(W.data()), matsz,matsz,nmat, thrust::raw_pointer_cast(diagwW.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyMatrixKernels::DiagwA"); } EddyCatch void StackResampler::make_dwWt_dwp_vectors(const thrust::device_vector& dW, const thrust::device_vector& dp, unsigned int matsz, unsigned int nmat, bool sync, thrust::device_vector& dWtdp) const EddyTry { EddyMatrixKernels::Atb<<>>(thrust::raw_pointer_cast(dW.data()), thrust::raw_pointer_cast(dp.data()), matsz,matsz,nmat,nmat, thrust::raw_pointer_cast(dWtdp.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyMatrixKernels::Atb"); return; } EddyCatch void StackResampler::make_Wir_matrices(const EDDY::CudaVolume& zcoord, unsigned int xzp, bool sync, thrust::device_vector& Wir) const EddyTry { int tpb = _threads_per_block_Wir; EddyMatrixKernels::Wir<<>>(zcoord.GetPtr(),zcoord.Size(0),zcoord.Size(1), zcoord.Size(2),zcoord.Size(0),xzp, thrust::raw_pointer_cast(Wir.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyMatrixKernels::Wir"); } EddyCatch void StackResampler::make_Wir_t_y_vectors(const EDDY::CudaVolume& y, const thrust::device_vector& Wir, unsigned int xzp, bool sync, thrust::device_vector& Wirty) const EddyTry { int tpb = _threads_per_block_Wirty; EddyMatrixKernels::Wirty<<>>(y.GetPtr(),thrust::raw_pointer_cast(Wir.data()), y.Size(0),y.Size(1),y.Size(2),y.Size(0),xzp, thrust::raw_pointer_cast(Wirty.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyMatrixKernels::Wirty"); } EddyCatch void StackResampler::make_WtW_StS_matrices(const thrust::device_vector& Wir, unsigned int mn, unsigned int nmat, const thrust::device_vector& StS, bool sync, thrust::device_vector& WtW) const EddyTry { dim3 block = _threads_per_block_WtW_StS; EddyMatrixKernels::KtK<<>>(thrust::raw_pointer_cast(Wir.data()),mn,mn,nmat, thrust::raw_pointer_cast(StS.data()),1.0,true, thrust::raw_pointer_cast(WtW.data())); if (sync) EddyCudaHelperFunctions::CudaSync("KtK_Kernels::KtK"); return; } EddyCatch void StackResampler::solve_for_c_hat(// Input const thrust::device_vector& WtW, // WtW+StS matrices for one xz-plane. const thrust::device_vector& Wty, // Wty vectors for one xz-plane unsigned int n, // Size of KtK (nxn) unsigned int nmat, // Number of matrices for one xz-plane bool sync, // If true syncs after submitting kernel // Output thrust::device_vector& chat) const EddyTry // Returns inv(WtW)*Wty for all matrices in xz-plane { // Allocate memory for Q and R matrices for QR decomposition thrust::device_vector Qt(nmat*n*n); thrust::device_vector R(nmat*n*n); // Dynamically allocated shared memory, per block (matrix) size_t sh_mem_sz = 2*n*sizeof(float); int tpb = _threads_per_block_QR; EddyMatrixKernels::QR<<>>(thrust::raw_pointer_cast(WtW.data()),n,n,nmat, thrust::raw_pointer_cast(Qt.data()), thrust::raw_pointer_cast(R.data())); if (sync) EddyCudaHelperFunctions::CudaSync("QR_Kernels::QR"); tpb = _threads_per_block_Solve; EddyMatrixKernels::Solve<<>>(thrust::raw_pointer_cast(Qt.data()), thrust::raw_pointer_cast(R.data()), thrust::raw_pointer_cast(Wty.data()),n,n,nmat, thrust::raw_pointer_cast(chat.data())); if (sync) EddyCudaHelperFunctions::CudaSync("QR_Kernels::Solve"); return; } EddyCatch void StackResampler::make_y_hat_vectors(// Input const thrust::device_vector& W, const thrust::device_vector& chat, unsigned int mn, unsigned int nvec, bool sync, // Output thrust::device_vector& yhat) const EddyTry { int tpb = _threads_per_block_yhat; EddyMatrixKernels::Ab<<>>(thrust::raw_pointer_cast(W.data()), thrust::raw_pointer_cast(chat.data()), mn,mn,1,nvec,thrust::raw_pointer_cast(yhat.data())); if (sync) EddyCudaHelperFunctions::CudaSync("EddyMatrixKernels::Ab"); return; } EddyCatch void StackResampler::transfer_y_hat_to_volume(// Input const thrust::device_vector& yhat, unsigned int xzp, bool sync, // Output EDDY::CudaVolume& ovol) const EddyTry { int tpb = _threads_per_block_transfer; int nblocks = (ovol.Size(0)%tpb) ? ovol.Size(0) / tpb + 1 : ovol.Size(0) / tpb; EddyKernels::transfer_y_hat_to_volume<<>>(thrust::raw_pointer_cast(yhat.data()),ovol.Size(0), ovol.Size(1),ovol.Size(2),xzp,ovol.GetPtr()); if (sync) EddyCudaHelperFunctions::CudaSync("EddyMatrixKernels::transfer_y_hat_to_volume"); return; } EddyCatch void StackResampler::write_matrix(const thrust::device_vector& mats, unsigned int offs, unsigned int m, unsigned int n, const std::string& fname) const EddyTry { thrust::device_vector::const_iterator first = mats.begin() + offs; thrust::device_vector::const_iterator last = mats.begin() + offs + m*n; thrust::host_vector mat(first,last); NEWMAT::Matrix newmat(m,n); for (unsigned int i=0; i& mats, unsigned int nmat, unsigned int m, unsigned int n, const std::string& basefname) const EddyTry { char fname[256]; for (unsigned int f=0; f& sz, const thrust::device_vector& W, const thrust::device_vector& Wir, const thrust::device_vector& w, const thrust::device_vector& wp, const thrust::device_vector& wW, const thrust::device_vector& Wirtg, const thrust::device_vector& wWtwp, const thrust::device_vector& WirtWir, const thrust::device_vector& wWtwW, const thrust::device_vector& sum_vec, const thrust::device_vector& sum_mat, const thrust::device_vector& c_hat, const thrust::device_vector& y_hat) const EddyTry { unsigned int xs = z.Size(0); unsigned int ys = z.Size(1); unsigned int zs = z.Size(2); NEWIMAGE::volume tmpvol = z.GetVolume(); NEWMAT::ColumnVector tmpvec(zs); for (unsigned int k=0; k