#pragma once #include "params.h" template void run_flash_mla_combine_kernel(DecodingParams ¶ms, cudaStream_t stream);