#pragma once
#include "../extensions.h"
torch::Tensor non_diag_mask_cpu(torch::Tensor row, torch::Tensor col, int64_t M, int64_t N, int64_t k);