#pragma once #include torch::Tensor partition_kway_cpu(torch::Tensor rowptr, torch::Tensor col, int64_t num_parts);