#pragma once

#include <torch/extension.h>

std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_cuda(torch::Tensor src, torch::Tensor index,
                 torch::optional<torch::Tensor> optional_out,
                 torch::optional<int64_t> dim_size, std::string reduce);

torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
                              torch::optional<torch::Tensor> optional_out);
template<typename T>
__device__ T __ldg(const T* ptr) {
    return *ptr;
}
