#pragma once #include void synchronize_cuda(); void read_async_cuda(torch::Tensor src, torch::optional optional_offset, torch::optional optional_count, torch::Tensor index, torch::Tensor dst, torch::Tensor buffer); void write_async_cuda(torch::Tensor src, torch::Tensor offset, torch::Tensor count, torch::Tensor dst);