#pragma once #include std::tuple> segment_csr_cuda(torch::Tensor src, torch::Tensor indptr, torch::optional optional_out, std::string reduce); torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr, torch::optional optional_out);