UPDATE

ffeba11a · mayp777 · 29deb085 · ffeba11a · ffeba11a · ffeba11a
Commit ffeba11a authored Sep 02, 2024 by mayp777
20 changed files
--- a/torchaudio/csrc/CMakeLists.txt
+++ b/torchaudio/csrc/CMakeLists.txt
-# the following line is added in order to export symbols when building on Windows
-# this approach has some limitations as documented in https://github.com/pytorch/pytorch/pull/3650
-if (MSVC)
-  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-endif()
 ################################################################################
 # libtorchaudio
 ################################################################################
 set(
-  LIBTORCHAUDIO_SOURCES
+  sources
  lfilter.cpp
  overdrive.cpp
  utils.cpp
  )
 set(
-  LIBTORCHAUDIO_INCLUDE_DIRS
+  additional_libs
-  ${PROJECT_SOURCE_DIR}
  )
 set(
-  LIBTORCHAUDIO_LINK_LIBRARIES
+  compile_definitions)
-  torch
-  )
-set(
-  LIBTORCHAUDIO_COMPILE_DEFINITIONS)
 #------------------------------------------------------------------------------#
 # START OF CUSTOMIZATION LOGICS
@@ -33,7 +21,7 @@ set(
 if(BUILD_RNNT)
  list(
    APPEND
-    LIBTORCHAUDIO_SOURCES
+    sources
    rnnt/cpu/compute_alphas.cpp
    rnnt/cpu/compute_betas.cpp
    rnnt/cpu/compute.cpp
@@ -45,16 +33,18 @@ if(BUILD_RNNT)
  if (USE_CUDA)
    list(
      APPEND
-      LIBTORCHAUDIO_SOURCES
+      sources
      rnnt/gpu/compute_alphas.cu
      rnnt/gpu/compute_betas.cu
      rnnt/gpu/compute.cu
      )
  endif()
  if (USE_ROCM)
+    set (CMAKE_C_COMPILER "hipcc")
+    set (CMAKE_CXX_COMPILER "hipcc")
    list(
      APPEND
-      LIBTORCHAUDIO_SOURCES
+      sources
      rnnt/dcu/compute_alphas.cpp
      rnnt/dcu/compute_betas.cpp
      rnnt/dcu/compute.cpp
@@ -63,98 +53,59 @@ if(BUILD_RNNT)
 endif()
-if(USE_CUDA)
+if(BUILD_RIR)
-  list(
+  list(APPEND sources rir.cpp)
-    APPEND
+  list(APPEND compile_definitions INCLUDE_RIR)
-    LIBTORCHAUDIO_INCLUDE_DIRS
-    ${CUDA_TOOLKIT_INCLUDE}
-    )
-  list(
-    APPEND
-    LIBTORCHAUDIO_LINK_LIBRARIES
-    ${C10_CUDA_LIBRARY}
-    ${CUDA_CUDART_LIBRARY}
-    )
-  list(
-    APPEND
-    LIBTORCHAUDIO_COMPILE_DEFINITIONS
-    USE_CUDA
-  )
 endif()
-if(USE_ROCM)
+if(BUILD_ALIGN)
-  list(
-    APPEND
-    LIBTORCHAUDIO_INCLUDE_DIRS
-    ${CUDA_TOOLKIT_INCLUDE}
-    )
-  list(
-    APPEND
-    LIBTORCHAUDIO_LINK_LIBRARIES
-    ${C10_CUDA_LIBRARY}
-    ${CUDA_CUDART_LIBRARY}
-    )
  list(
    APPEND
-    LIBTORCHAUDIO_COMPILE_DEFINITIONS
+    sources
-    USE_ROCM
+    forced_align/compute.cpp
+    forced_align/cpu/compute.cpp
  )
+  list(APPEND compile_definitions INCLUDE_ALIGN)
+  if (USE_CUDA)
+    list(
+      APPEND
+      sources
+      forced_align/gpu/compute.cu
+    )
+  endif()
 endif()
-if(BUILD_KALDI)
+if(USE_CUDA)
-  list(APPEND LIBTORCHAUDIO_LINK_LIBRARIES kaldi)
-  list(APPEND LIBTORCHAUDIO_SOURCES kaldi.cpp)
-  list(APPEND LIBTORCHAUDIO_COMPILE_DEFINITIONS INCLUDE_KALDI)
-endif()
-if(BUILD_SOX)
-  list(
-    APPEND
-    LIBTORCHAUDIO_LINK_LIBRARIES
-    libsox
-    )
  list(
    APPEND
-    LIBTORCHAUDIO_SOURCES
+    sources
-    sox/io.cpp
+    iir_cuda.cu
-    sox/utils.cpp
+  )
-    sox/effects.cpp
-    sox/effects_chain.cpp
-    sox/types.cpp
-    )
  list(
    APPEND
-    LIBTORCHAUDIO_COMPILE_DEFINITIONS
+    additional_libs
-    INCLUDE_SOX
+    cuda_deps
    )
 endif()
 if(OpenMP_CXX_FOUND)
  list(
    APPEND
-    LIBTORCHAUDIO_LINK_LIBRARIES
+    additional_libs
    OpenMP::OpenMP_CXX
    )
 endif()
-if(USE_FFMPEG)
-  list(
-    APPEND
-    LIBTORCHAUDIO_COMPILE_DEFINITIONS
-    USE_FFMPEG
-    )
-endif()
 #------------------------------------------------------------------------------#
 # END OF CUSTOMIZATION LOGICS
 #------------------------------------------------------------------------------#
 torchaudio_library(
  libtorchaudio
-  "${LIBTORCHAUDIO_SOURCES}"
+  "${sources}"
-  "${LIBTORCHAUDIO_INCLUDE_DIRS}"
+  ""
-  "${LIBTORCHAUDIO_LINK_LIBRARIES}"
+  "torch;${additional_libs}"
-  "${LIBTORCHAUDIO_COMPILE_DEFINITIONS}"
+  "${compile_definitions}"
  )
 if (APPLE)
@@ -164,83 +115,18 @@ else()
 endif()
 ################################################################################
-# libtorchaudio_ffmpeg
+# Python extensions
-################################################################################
-if(USE_FFMPEG)
-  set(
-    LIBTORCHAUDIO_FFMPEG_SOURCES
-    ffmpeg/ffmpeg.cpp
-    ffmpeg/filter_graph.cpp
-    ffmpeg/stream_reader/buffer.cpp
-    ffmpeg/stream_reader/decoder.cpp
-    ffmpeg/stream_reader/sink.cpp
-    ffmpeg/stream_reader/stream_processor.cpp
-    ffmpeg/stream_reader/stream_reader.cpp
-    ffmpeg/stream_reader/stream_reader_wrapper.cpp
-    ffmpeg/stream_reader/stream_reader_binding.cpp
-    ffmpeg/stream_reader/stream_reader_tensor_binding.cpp
-    ffmpeg/stream_writer/stream_writer.cpp
-    ffmpeg/stream_writer/stream_writer_wrapper.cpp
-    ffmpeg/stream_writer/stream_writer_binding.cpp
-    ffmpeg/utils.cpp
-    )
-  message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}")
-  find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil)
-  torchaudio_library(
-    libtorchaudio_ffmpeg
-    "${LIBTORCHAUDIO_FFMPEG_SOURCES}"
-    "${LIBTORCHAUDIO_INCLUDE_DIRS};${FFMPEG_INCLUDE_DIRS}"
-    "torch;${FFMPEG_LIBRARIES}"
-    "${LIBTORCHAUDIO_COMPILE_DEFINITIONS}"
-  )
-endif()
-################################################################################
-# TODO: Rename this to _torchaudio_sox.so
-# _torchaudio.so
 ################################################################################
 if (BUILD_TORCHAUDIO_PYTHON_EXTENSION)
  set(
-    EXTENSION_SOURCES
+    extension_sources
-    sox/pybind/pybind.cpp
+    pybind/pybind.cpp
    )
-  #----------------------------------------------------------------------------#
-  # START OF CUSTOMIZATION LOGICS
-  #----------------------------------------------------------------------------#
-  if(BUILD_SOX)
-    list(
-      APPEND
-      EXTENSION_SOURCES
-      sox/pybind/effects.cpp
-      sox/pybind/effects_chain.cpp
-      sox/pybind/io.cpp
-      sox/pybind/utils.cpp
-      )
-  endif()
-  #----------------------------------------------------------------------------#
-  # END OF CUSTOMIZATION LOGICS
-  #----------------------------------------------------------------------------#
  torchaudio_extension(
    _torchaudio
-    "${EXTENSION_SOURCES}"
+    "${extension_sources}"
+    ""
+    "libtorchaudio"
    ""
-    libtorchaudio
-    "${LIBTORCHAUDIO_COMPILE_DEFINITIONS}"
    )
-  if(USE_FFMPEG)
-    set(
-      FFMPEG_EXTENSION_SOURCES
-      ffmpeg/pybind/typedefs.cpp
-      ffmpeg/pybind/pybind.cpp
-      ffmpeg/pybind/stream_reader.cpp
-      ffmpeg/pybind/stream_writer.cpp
-      )
-    torchaudio_extension(
-      _torchaudio_ffmpeg
-      "${FFMPEG_EXTENSION_SOURCES}"
-      "${FFMPEG_INCLUDE_DIRS}"
-      "libtorchaudio_ffmpeg"
-      ""
-      )
-  endif()
 endif()
--- a/torchaudio/csrc/cuctc/CMakeLists.txt
+++ b/torchaudio/csrc/cuctc/CMakeLists.txt
+# Build rules for the cuda ctc decoder library and its Python binding.
+# Directory-wide default: keep C++ symbols visible.
+set(CMAKE_CXX_VISIBILITY_PRESET default)
+# On Windows, export every symbol so the shared library is linkable.
+# This approach has some limitations as documented in
+# https://github.com/pytorch/pytorch/pull/3650
+if(MSVC)
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+# Translation units compiled into the decoder library.
+set(libctc_prefix_decoder_src
+  src/ctc_prefix_decoder.cpp
+  src/ctc_prefix_decoder_kernel_v2.cu
+)
+# Link dependencies shared by the library and the Python extension.
+set(additional_libs cuda_deps)
+# Positional arguments: name, sources, include dirs, link libs, definitions.
+torchaudio_library(
+  libctc_prefix_decoder
+  "${libctc_prefix_decoder_src}"
+  "${CMAKE_CURRENT_SOURCE_DIR}"
+  "${additional_libs}"
+  ""
+)
+if(BUILD_TORCHAUDIO_PYTHON_EXTENSION)
+  torchaudio_extension(
+    pybind11_prefixctc
+    src/python_binding.cpp
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+    "libctc_prefix_decoder;${additional_libs}"
+    ""
+  )
+endif()
--- a/torchaudio/csrc/cuctc/LICENSE
+++ b/torchaudio/csrc/cuctc/LICENSE
+BSD 2-Clause License
+Copyright (c) 2023 Nvidia 
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/torchaudio/csrc/cuctc/include/ctc_prefix_decoder.h
+++ b/torchaudio/csrc/cuctc/include/ctc_prefix_decoder.h
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#ifndef __ctc_prefix_decoder_h_
+#define __ctc_prefix_decoder_h_
+#include <cuda_runtime.h>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+// Public entry points of the cu_ctc decoder. Only declarations live here;
+// the definitions are not part of this header.
+namespace cu_ctc {
+// Decoder state. It is only forward-declared, so users of this header can
+// hold it by pointer but cannot inspect it.
+struct InternalData;
+// NOTE(review): presumably stream_ptr is a cudaStream_t and the returned
+// value an InternalData*, both carried as integers - confirm in the .cpp.
+std::uintptr_t prefixCTC_alloc(std::uintptr_t stream_ptr);
+// NOTE(review): presumably releases a handle obtained from prefixCTC_alloc -
+// confirm in the .cpp.
+void prefixCTC_free(std::uintptr_t inter_data_ptr);
+// NOTE(review): judging by the name, the returned tuple carries the required
+// buffer size first; the meaning of the int member is not visible here -
+// confirm in the .cpp before relying on either.
+std::tuple<size_t, int> calculate_require_buff_and_init_internal_data(
+    InternalData* inter_data,
+    int batch_size,
+    int seq_len,
+    int vocab_size,
+    int beam,
+    std::uintptr_t buff_ptr,
+    size_t buff_size,
+    float* log_prob_data_ptr,
+    int* original_lens,
+    const std::vector<int>& prob_sizes,
+    const std::vector<int>& prob_strides,
+    int blid,
+    float threshold);
+// NOTE(review): clist, clen and score look like output buffers and the int
+// return like a status code - TODO confirm against the implementation.
+int ctc_beam_search_decoder_batch_gpu(
+    InternalData* inter_data,
+    float* pp,
+    int blid,
+    int spid,
+    int* clist,
+    int* clen,
+    float* score);
+} // namespace cu_ctc
+#endif
--- a/torchaudio/csrc/cuctc/include/ctc_prefix_decoder_host.h
+++ b/torchaudio/csrc/cuctc/include/ctc_prefix_decoder_host.h
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#ifndef __ctc_prefix_decoder_host_h_
+#define __ctc_prefix_decoder_host_h_
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+// CUDA_CHECK(X): evaluates the CUDA runtime expression X exactly once. If the
+// result differs from cudaSuccess, prints the file, line, stringified
+// expression and the name from cudaGetErrorName to stderr, then aborts.
+// The temporary uses an unusual name so that X may itself mention a variable
+// called `result` without clashing with the macro's local.
+#define CUDA_CHECK(X)                                         \
+  do {                                                        \
+    auto cu_ctc_check_status_ = (X);                          \
+    if (cu_ctc_check_status_ != cudaSuccess) {                \
+      const char* cu_ctc_check_name_ =                        \
+          cudaGetErrorName(cu_ctc_check_status_);             \
+      std::fprintf(                                           \
+          stderr,                                             \
+          "File %s Line %d %s returned %s.\n",                \
+          __FILE__,                                           \
+          __LINE__,                                           \
+          #X,                                                 \
+          cu_ctc_check_name_);                                \
+      std::abort();                                           \
+    }                                                         \
+  } while (0)
+// CHECK(X, ERROR_INFO): evaluates X exactly once. If it is false, prints the
+// file, line, stringified expression and ERROR_INFO (formatted with %s, so it
+// must be a C string) to stderr, then aborts.
+#define CHECK(X, ERROR_INFO)                        \
+  do {                                              \
+    auto cu_ctc_check_ok_ = (X);                    \
+    if (!cu_ctc_check_ok_) {                        \
+      std::fprintf(                                 \
+          stderr,                                   \
+          " File %s Line %d %s ERROR_INFO: %s .\n", \
+          __FILE__,                                 \
+          __LINE__,                                 \
+          #X,                                       \
+          ERROR_INFO);                              \
+      std::abort();                                 \
+    }                                               \
+  } while (0)
+// Host-side entry points used by the decoder. Every function takes the
+// LogProb state by pointer and the CUDA stream to work on; the definitions
+// are not part of this header.
+// NOTE(review): the int return values look like status codes - confirm
+// against the implementation.
+namespace cu_ctc {
+// Only forward-declared here; users of this header hold it by pointer.
+struct LogProb;
+int init_log_prob_and_cal_max_select_seq_len(
+    LogProb* log_prob_struct,
+    int blid,
+    float threshold,
+    cudaStream_t stream);
+int CTC_prob_matrix_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* clast,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int bs,
+    int blid,
+    int spid,
+    cudaStream_t stream);
+int CTC_prob_merge_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float* ptable,
+    float* ptablen,
+    int* ptid,
+    int* clast,
+    int* clist,
+    int* clen,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int bs,
+    cudaStream_t stream,
+    int blid);
+int CTC_prob_first_step_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float2* pprev,
+    int* ptid,
+    int* clast,
+    int* clen,
+    int* clist,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int bs,
+    float* score,
+    cudaStream_t stream,
+    int blid);
+int CTC_prob_topK_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* ptid,
+    int* clast,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int blid,
+    int bs,
+    float* score,
+    float* topk_key_buff,
+    int* topk_value_buff,
+    cudaStream_t stream,
+    bool is_last_step);
+// NOTE(review): "differnet" is a misspelling of "different". The name is part
+// of the declared interface, so it is left as is; renaming it would have to
+// touch the definition and every caller.
+int CTC_copy_list_len_for_differnet_parity(
+    LogProb* log_prob_struct,
+    int step,
+    int max_select_seq_len,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int bs,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    cudaStream_t stream);
+} // namespace cu_ctc
+#endif
--- a/torchaudio/csrc/cuctc/src/bitonic_topk/LICENSE
+++ b/torchaudio/csrc/cuctc/src/bitonic_topk/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2020 NVIDIA Corporation
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/torchaudio/csrc/cuctc/src/bitonic_topk/bitonic_sort.cuh
+++ b/torchaudio/csrc/cuctc/src/bitonic_topk/bitonic_sort.cuh
+/**
+ *  Modified from Rapidsai/raft(https://github.com/rapidsai/raft)
+ *
+ */
+#pragma once
+#include <cstdint>
+#include <limits>
+namespace cu_ctc {
+namespace topk {
+static constexpr int WarpSize = 32;
+/** True when `num` is non-zero and `num & (num - 1)` is zero. */
+template <typename IntType>
+constexpr inline __host__ __device__ bool isPo2(IntType num) {
+  return num != 0 && (num & (num - 1)) == 0;
+}
+/** Returns the value of the PTX special register %laneid for the caller. */
+inline __device__ int laneId() {
+  int id;
+  // "=r"(id) makes `id` the register output operand %0 of the mov.
+  asm("mov.s32 %0, %%laneid;" : "=r"(id));
+  return id;
+}
+/**
+ * @brief Warp-level xor shuffle.
+ *
+ * Forwards to __shfl_xor_sync when CUDART_VERSION >= 9000 and to the legacy
+ * __shfl_xor otherwise; the legacy call does not use `mask`.
+ *
+ * @tparam T the data type (currently assumed to be 4B)
+ * @param val value contributed by the calling thread
+ * @param laneMask mask applied to perform the xor shuffle
+ * @param width lane width
+ * @param mask mask of participating threads (Volta+)
+ * @return the shuffled data
+ */
+template <typename T>
+inline __device__ T shfl_xor(
+    T val,
+    int laneMask,
+    int width = WarpSize,
+    uint32_t mask = 0xffffffffu) {
+#if CUDART_VERSION < 9000
+  return __shfl_xor(val, laneMask, width);
+#else
+  return __shfl_xor_sync(mask, val, laneMask, width);
+#endif
+}
+/**
+ * @brief Warp-level indexed shuffle.
+ *
+ * Forwards to __shfl_sync when CUDART_VERSION >= 9000 and to the legacy
+ * __shfl otherwise; the legacy call does not use `mask`.
+ *
+ * @tparam T the data type (currently assumed to be 4B)
+ * @param val value contributed by the calling thread
+ * @param srcLane lane from which the value is read
+ * @param width lane width
+ * @param mask mask of participating threads (Volta+)
+ * @return the shuffled data
+ */
+template <typename T>
+inline __device__ T
+shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) {
+#if CUDART_VERSION < 9000
+  return __shfl(val, srcLane, width);
+#else
+  return __shfl_sync(mask, val, srcLane, width);
+#endif
+}
+/**
+ * Warp-wide boolean "any": forwards inFlag to __any_sync when
+ * CUDART_VERSION >= 9000, and to the legacy __any (which ignores `mask`)
+ * otherwise. The result is returned as bool.
+ */
+inline __device__ bool any(bool inFlag, uint32_t mask = 0xffffffffu) {
+#if CUDART_VERSION < 9000
+  return __any(inFlag);
+#else
+  return __any_sync(mask, inFlag);
+#endif
+}
+/**
+ * @brief Smallest value of T, usable as a "below everything" sentinel.
+ *
+ * Requires <limits> for std::numeric_limits.
+ *
+ * @tparam T arithmetic type with a std::numeric_limits specialization
+ * @return -infinity when T is signed and has an infinity, otherwise
+ *         std::numeric_limits<T>::lowest()
+ */
+template <typename T>
+constexpr T lower_bound() {
+  using limits = std::numeric_limits<T>;
+  // if/else keeps the branch that does not apply to T from being instantiated.
+  if constexpr (limits::has_infinity && limits::is_signed) {
+    return -limits::infinity();
+  } else {
+    return limits::lowest();
+  }
+}
+/**
+ * @brief Largest value of T, usable as an "above everything" sentinel.
+ *
+ * Requires <limits> for std::numeric_limits.
+ *
+ * @tparam T arithmetic type with a std::numeric_limits specialization
+ * @return +infinity when T has an infinity, otherwise
+ *         std::numeric_limits<T>::max()
+ */
+template <typename T>
+constexpr T upper_bound() {
+  using limits = std::numeric_limits<T>;
+  // if/else keeps the branch that does not apply to T from being instantiated.
+  if constexpr (limits::has_infinity) {
+    return limits::infinity();
+  } else {
+    return limits::max();
+  }
+}
+namespace helpers {
+/** Exchanges the values of `x` and `y` through one temporary copy. */
+template <typename T>
+__device__ __forceinline__ void swap(T& x, T& y) {
+  T saved = x;
+  x = y;
+  y = saved;
+}
+/** Assigns `x` to `ptr` when `cond` is true; otherwise leaves `ptr` as is. */
+template <typename T>
+__device__ __forceinline__ void conditional_assign(bool cond, T& ptr, T x) {
+  if (!cond) {
+    return;
+  }
+  ptr = x;
+}
+} // namespace helpers
+/**
+ * Warp-wide bitonic merge and sort.
+ * The data is strided among `warp_width` threads,
+ * e.g. calling `bitonic<4>(ascending=true).sort(arr)` takes a unique 4-element
+ * array as input of each thread in a warp and sorts them, such that for a fixed
+ * i, arr[i] are sorted within the threads in a warp, and for any i < j, arr[j]
+ * in any thread is not smaller than arr[i] in any other thread. When
+ * `warp_width < WarpSize`, the data is sorted within all subwarps of the warp
+ * independently.
+ *
+ * As an example, assuming `Size = 4`, `warp_width = 16`, and `WarpSize = 32`,
+ * sorting a permutation of numbers 0-63 in each subwarp yields the following
+ * result:
+ * `
+ *  arr_i \ laneId()
+ *       0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15    16  17  18 ...
+ *      subwarp_1                                                         subwarp_2
+ *   0   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15     0   1   2 ...
+ *   1  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31    16  17  18 ...
+ *   2  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47    32  33  34 ...
+ *   3  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63    48  49  50 ...
+ * `
+ *
+ * @tparam Size
+ *   number of elements processed in each thread;
+ *   i.e. the total data size is `Size * warp_width`.
+ *   Must be power-of-two.
+ *
+ */
+template <int Size = 1>
+class bitonic {
+  static_assert(
+      isPo2(Size),
+      "class bitonic<Size> , size should be power of 2 \n");
+ public:
+  /**
+   * Initialize bitonic sort config.
+   *
+   * @param ascending
+   *   the resulting order (true: ascending, false: descending).
+   * @param warp_width
+   *   the number of threads participating in the warp-level primitives;
+   *   the total size of the sorted data is `Size * warp_width`.
+   *   Must be power-of-two, not larger than the WarpSize.
+   */
+  __device__ __forceinline__ explicit bitonic(
+      bool ascending,
+      int warp_width = WarpSize)
+      // Initializers are listed in member declaration order.
+      : warp_width_(warp_width), ascending_(ascending) {}
+  bitonic(bitonic const&) = delete;
+  bitonic(bitonic&&) = delete;
+  auto operator=(bitonic const&) -> bitonic& = delete;
+  auto operator=(bitonic&&) -> bitonic& = delete;
+  /**
+   * You can think of this function in two ways:
+   *
+   *   1) Sort any bitonic sequence.
+   *   2) Merge two halves of the input data assuming they're already sorted,
+   * and their order is opposite (i.e. either ascending, descending or
+   * vice-versa).
+   *
+   * The input pointers are unique per-thread.
+   * See the class description for the description of the data layout.
+   *
+   * @param keys
+   *   is a device pointer to a contiguous array of keys, unique per thread;
+   * must be at least `Size` elements long.
+   * @param payloads
+   *   are zero or more associated arrays of the same size as keys, which are
+   * sorted together with the keys; must be at least `Size` elements long.
+   */
+  template <typename KeyT, typename... PayloadTs>
+  __device__ __forceinline__ void merge(
+      KeyT* __restrict__ keys,
+      PayloadTs* __restrict__... payloads) const {
+    return bitonic<Size>::merge_(ascending_, warp_width_, keys, payloads...);
+  }
+  /**
+   * Sort the data.
+   * The input pointers are unique per-thread.
+   * See the class description for the description of the data layout.
+   *
+   * @param keys
+   *   is a device pointer to a contiguous array of keys, unique per thread;
+   * must be at least `Size` elements long.
+   * @param payloads
+   *   are zero or more associated arrays of the same size as keys, which are
+   * sorted together with the keys; must be at least `Size` elements long.
+   */
+  template <typename KeyT, typename... PayloadTs>
+  __device__ __forceinline__ void sort(
+      KeyT* __restrict__ keys,
+      PayloadTs* __restrict__... payloads) const {
+    return bitonic<Size>::sort_(ascending_, warp_width_, keys, payloads...);
+  }
+  /**
+   * @brief `merge` variant for the case of one element per thread
+   *        (pass input by a reference instead of a pointer).
+   *
+   * @param key the single key held by this thread
+   * @param payload zero or more values reordered together with the key
+   */
+  template <typename KeyT, typename... PayloadTs, int S = Size>
+  __device__ __forceinline__ auto merge(
+      KeyT& __restrict__ key,
+      PayloadTs& __restrict__... payload) const
+      -> std::enable_if_t<S == 1, void> // SFINAE to enable this for Size == 1
+                                        // only
+  {
+    static_assert(S == Size);
+    return merge(&key, &payload...);
+  }
+  /**
+   * @brief `sort` variant for the case of one element per thread
+   *        (pass input by a reference instead of a pointer).
+   *
+   * @param key the single key held by this thread
+   * @param payload zero or more values reordered together with the key
+   */
+  template <typename KeyT, typename... PayloadTs, int S = Size>
+  __device__ __forceinline__ auto sort(
+      KeyT& __restrict__ key,
+      PayloadTs& __restrict__... payload) const
+      -> std::enable_if_t<S == 1, void> // SFINAE to enable this for Size == 1
+                                        // only
+  {
+    static_assert(S == Size);
+    return sort(&key, &payload...);
+  }
+ private:
+  const int warp_width_;
+  const bool ascending_;
+  // Other instantiations call each other's private static helpers
+  // (see the recursion in sort_).
+  template <int AnotherSize>
+  friend class bitonic;
+  /**
+   * Static implementation of `merge`: first compare-exchange within the
+   * per-thread array, then compare-exchange across lanes via `shfl_xor`.
+   */
+  template <typename KeyT, typename... PayloadTs>
+  static __device__ __forceinline__ void merge_(
+      bool ascending,
+      int warp_width,
+      KeyT* __restrict__ keys,
+      PayloadTs* __restrict__... payloads) {
+    // Stage 1: in-thread compare-exchange with halving strides.
+#pragma unroll
+    for (int size = Size; size > 1; size >>= 1) {
+      const int stride = size >> 1;
+#pragma unroll
+      for (int offset = 0; offset < Size; offset += size) {
+#pragma unroll
+        for (int i = offset + stride - 1; i >= offset; i--) {
+          const int other_i = i + stride;
+          KeyT& key = keys[i];
+          KeyT& other = keys[other_i];
+          if (ascending ? key > other : key < other) {
+            helpers::swap(key, other);
+            (helpers::swap(payloads[i], payloads[other_i]), ...);
+          }
+        }
+      }
+    }
+    // Stage 2: cross-lane compare-exchange for every per-thread slot.
+    const int lane = laneId();
+#pragma unroll
+    for (int i = 0; i < Size; i++) {
+      KeyT& key = keys[i];
+      for (int stride = (warp_width >> 1); stride > 0; stride >>= 1) {
+        // Whether this lane is the upper partner of the (lane, lane ^ stride)
+        // pair.
+        const bool is_second = lane & stride;
+        const KeyT other = shfl_xor(key, stride, warp_width);
+        const bool do_assign =
+            (ascending != is_second) ? key > other : key < other;
+        helpers::conditional_assign(do_assign, key, other);
+        // NB: don't put shfl_xor in a conditional; it must be called by all
+        // threads in a warp.
+        (helpers::conditional_assign(
+             do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)),
+         ...);
+      }
+    }
+  }
+  /**
+   * Static implementation of `sort`: builds a bitonic sequence (recursively
+   * for Size > 1, by growing merge widths for Size == 1) and then merges it.
+   */
+  template <typename KeyT, typename... PayloadTs>
+  static __device__ __forceinline__ void sort_(
+      bool ascending,
+      int warp_width,
+      KeyT* __restrict__ keys,
+      PayloadTs* __restrict__... payloads) {
+    if constexpr (Size == 1) {
+      const int lane = laneId();
+      // Merge groups of doubling width; the direction alternates between
+      // neighbouring groups (`lane & width`).
+      for (int width = 2; width < warp_width; width <<= 1) {
+        bitonic<1>::merge_(lane & width, width, keys, payloads...);
+      }
+    } else {
+      constexpr int kSize2 = Size / 2;
+      // Sort the two halves in opposite directions so that together they form
+      // a bitonic sequence.
+      bitonic<kSize2>::sort_(false, warp_width, keys, payloads...);
+      bitonic<kSize2>::sort_(
+          true, warp_width, keys + kSize2, (payloads + kSize2)...);
+    }
+    bitonic<Size>::merge_(ascending, warp_width, keys, payloads...);
+  }
+};
+} // namespace topk
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/bitonic_topk/pow2_utils.cuh
+++ b/torchaudio/csrc/cuctc/src/bitonic_topk/pow2_utils.cuh
+/**
+ *  Modified from Rapidsai/raft(https://github.com/rapidsai/raft)
+ *
+ */
+#pragma once
+#include <type_traits>
+namespace cu_ctc {
+/**
+ * @brief Give the logarithm of the number to base-2, rounded down
+ *        (any `num <= 1` yields `ret`, i.e. 0 by default).
+ * @tparam IntType data type (checked only for integers)
+ * @param num value whose logarithm is computed
+ * @param ret accumulator of the recursion; callers normally keep the default
+ */
+template <typename IntType>
+constexpr __host__ __device__ IntType
+log2(IntType num, IntType ret = IntType(0)) {
+  // Host-callable as well as device-callable: host-side constexpr helpers
+  // (e.g. topk::calc_capacity) invoke this function and must not depend on
+  // relaxed-constexpr compiler modes to do so.
+  return num <= IntType(1) ? ret : log2(num >> IntType(1), ret + IntType(1));
+}
+/**
+ * @brief Fast arithmetics and alignment checks for power-of-two values known at
+ * compile time.
+ *
+ * @tparam Value_ a compile-time value representable as a power-of-two.
+ */
+template <auto Value_>
+struct Pow2 {
+  using Type = decltype(Value_);
+  static constexpr Type Value = Value_;
+  static constexpr Type Log2 = log2(Value);
+  static constexpr Type Mask = Value - 1;
+  static_assert(std::is_integral<Type>::value, "Value must be integral.");
+  static_assert(Value && !(Value & Mask), "Value must be power of two.");
+#define Pow2_FUNC_QUALIFIER static constexpr __host__ __device__ __forceinline__
+#define Pow2_WHEN_INTEGRAL(I) std::enable_if_t<Pow2_IS_REPRESENTABLE_AS(I), I>
+#define Pow2_IS_REPRESENTABLE_AS(I) \
+  (std::is_integral<I>::value && Type(I(Value)) == Value)
+  /**
+   * Integer division by Value truncated toward zero
+   * (same as `x / Value` in C++).
+   *
+   *  Invariant: `x = Value * quot(x) + rem(x)`
+   */
+  template <typename I>
+  Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) quot(I x) noexcept {
+    if constexpr (std::is_signed<I>::value) {
+      // The arithmetic shift rounds toward negative infinity; add one back
+      // for negative values that are not exact multiples of Value.
+      return (x >> I(Log2)) + (x < 0 && (x & I(Mask)));
+    } else {
+      return x >> I(Log2);
+    }
+  }
+  /**
+   *  Remainder of integer division by Value truncated toward zero
+   *  (same as `x % Value` in C++).
+   *
+   *  Invariant: `x = Value * quot(x) + rem(x)`.
+   */
+  template <typename I>
+  Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) rem(I x) noexcept {
+    if constexpr (std::is_signed<I>::value) {
+      return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask));
+    } else {
+      return x & I(Mask);
+    }
+  }
+  /**
+   * Integer division by Value truncated toward negative infinity
+   * (same as `x // Value` in Python).
+   *
+   * Invariant: `x = Value * div(x) + mod(x)`.
+   *
+   * Note, `div` and `mod` for negative values are slightly faster
+   * than `quot` and `rem`, but behave slightly different
+   * compared to normal C++ operators `/` and `%`.
+   */
+  template <typename I>
+  Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) div(I x) noexcept {
+    return x >> I(Log2);
+  }
+  /**
+   * x modulo Value operation (remainder of the `div(x)`)
+   * (same as `x % Value` in Python).
+   *
+   * Invariant: `mod(x) >= 0`
+   * Invariant: `x = Value * div(x) + mod(x)`.
+   *
+   * Note, `div` and `mod` for negative values are slightly faster
+   * than `quot` and `rem`, but behave slightly different
+   * compared to normal C++ operators `/` and `%`.
+   */
+  template <typename I>
+  Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) mod(I x) noexcept {
+    return x & I(Mask);
+  }
+#define Pow2_CHECK_TYPE(T)                                     \
+  static_assert(                                               \
+      std::is_pointer<T>::value || std::is_integral<T>::value, \
+      "Only pointer or integral types make sense here")
+  /**
+   * Tell whether the pointer or integral is Value-aligned.
+   * NB: for pointers, the alignment is checked in bytes, not in elements.
+   */
+  template <typename PtrT>
+  Pow2_FUNC_QUALIFIER bool isAligned(PtrT p) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) {
+      return mod(p) == 0;
+    } else {
+      return mod(reinterpret_cast<Type>(p)) == 0;
+    }
+  }
+  /** Tell whether two pointers have the same address modulo Value. */
+  template <typename PtrT, typename PtrS>
+  Pow2_FUNC_QUALIFIER bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    Pow2_CHECK_TYPE(PtrS);
+    // Locals are value-initialized: an uninitialized variable is not allowed
+    // in a constexpr function before C++20.
+    Type x = Type(0);
+    Type y = Type(0);
+    if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT))
+      x = Type(mod(a));
+    else
+      x = mod(reinterpret_cast<Type>(a));
+    if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrS))
+      y = Type(mod(b));
+    else
+      y = mod(reinterpret_cast<Type>(b));
+    return x == y;
+  }
+  /** Get this or next Value-aligned address (in bytes) or integral. */
+  template <typename PtrT>
+  Pow2_FUNC_QUALIFIER PtrT roundUp(PtrT p) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) {
+      return (p + PtrT(Mask)) & PtrT(~Mask);
+    } else {
+      auto x = reinterpret_cast<Type>(p);
+      return reinterpret_cast<PtrT>((x + Mask) & (~Mask));
+    }
+  }
+  /** Get this or previous Value-aligned address (in bytes) or integral. */
+  template <typename PtrT>
+  Pow2_FUNC_QUALIFIER PtrT roundDown(PtrT p) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) {
+      return p & PtrT(~Mask);
+    } else {
+      auto x = reinterpret_cast<Type>(p);
+      return reinterpret_cast<PtrT>(x & (~Mask));
+    }
+  }
+#undef Pow2_CHECK_TYPE
+#undef Pow2_IS_REPRESENTABLE_AS
+#undef Pow2_FUNC_QUALIFIER
+#undef Pow2_WHEN_INTEGRAL
+};
+}; // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/bitonic_topk/warpsort_topk.cuh
+++ b/torchaudio/csrc/cuctc/src/bitonic_topk/warpsort_topk.cuh
+/**
+ *  Modified from Rapidsai/raft(https://github.com/rapidsai/raft)
+ *
+ */
+#pragma once
+#include <algorithm>
+#include <functional>
+#include <type_traits>
+#include "bitonic_sort.cuh"
+#include "pow2_utils.cuh"
+namespace cu_ctc {
+/*
+  Three APIs of different scopes are provided:
+    1. host function: warp_sort_topk()
+    2. block-wide API: class block_sort
+    3. warp-wide API: class warp_sort_filtered and class warp_sort_immediate
+  1. warp_sort_topk()
+    (see the docstring)
+  2. class block_sort
+    It can be regarded as a fixed size priority queue for a thread block,
+    although the API is not typical.
+    class warp_sort_filtered and warp_sort_immediate can be used to instantiate
+  block_sort.
+    It uses dynamic shared memory as an intermediate buffer.
+    So the required shared memory size should be calculated using
+    calc_smem_size_for_block_wide() and passed as the 3rd kernel launch
+  parameter.
+    To add elements to the queue, use add(T val, IdxT idx) with unique values
+  per-thread. Use WarpSortClass<...>::kDummy constant for the threads outside of
+  input bounds.
+    After adding is finished, function done() should be called. And finally,
+  store() is used to get the top-k result.
+    Example:
+      __global__ void kernel() {
+        block_sort<warp_sort_immediate, ...> queue(...);
+        for (IdxT i = threadIdx.x; i < len; i += blockDim.x) {
+          queue.add(in[i], in_idx[i]);
+        }
+        queue.done();
+        queue.store(out, out_idx);
+     }
+     int smem_size = calc_smem_size_for_block_wide<T>(...);
+     kernel<<<grid_dim, block_dim, smem_size>>>();
+  3. class warp_sort_filtered and class warp_sort_immediate
+    These two classes can be regarded as fixed size priority queue for a warp.
+    Usage is similar to class block_sort. No shared memory is needed.
+    The host function (warp_sort_topk) uses a heuristic to choose between these
+  two classes for sorting, warp_sort_immediate being chosen when the number of
+  inputs per warp is somewhat small (see the usage of
+  LaunchThreshold<warp_sort_immediate>::len_factor_for_choosing).
+    Example:
+      __global__ void kernel() {
+        warp_sort_immediate<...> queue(...);
+        int warp_id = threadIdx.x / WarpSize;
+        int lane_id = threadIdx.x % WarpSize;
+        for (IdxT i = lane_id; i < len; i += WarpSize) {
+          queue.add(in[i], idx[i]);
+        }
+        queue.done();
+        // each warp outputs to a different offset
+        queue.store(out + warp_id * k, out_idx + warp_id * k);
+      }
+ */
+namespace topk {
+static constexpr int kMaxCapacity = 256;
+/**
+ * Whether `left` should indeed be on the left w.r.t. `right`:
+ * `left < right` when `Ascending`, `left > right` otherwise.
+ */
+template <bool Ascending, typename T>
+__device__ __forceinline__ auto is_ordered(T left, T right) -> bool {
+  if constexpr (Ascending) {
+    return left < right;
+  } else {
+    return left > right;
+  }
+}
+/**
+ * Queue capacity used for a given `k`: `k` itself when `isPo2(k)` holds,
+ * otherwise `1 << (log2(k) + 1)`.
+ */
+constexpr inline auto calc_capacity(int k) -> int {
+  if (isPo2(k)) {
+    return k;
+  }
+  return 1 << (log2(k) + 1);
+}
+/**
+ * A fixed-size warp-level priority queue.
+ * By feeding the data through this queue, you get the `k <= Capacity`
+ * smallest/greatest values in the data.
+ *
+ * @tparam Capacity
+ *   maximum number of elements in the queue.
+ * @tparam Ascending
+ *   which comparison to use: `true` means `<`, collect the smallest elements,
+ *   `false` means `>`, collect the greatest elements.
+ * @tparam T
+ *   the type of keys (what is being compared)
+ * @tparam IdxT
+ *   the type of payload (normally, indices of elements), i.e.
+ *   the content sorted alongside the keys.
+ */
+template <int Capacity, bool Ascending, typename T, typename IdxT>
+class warp_sort {
+  static_assert(isPo2(Capacity));
+ public:
+  /**
+   *  The `empty` value for the chosen binary operation,
+   *  i.e. `Ascending ? upper_bound<T>() : lower_bound<T>()`.
+   */
+  static constexpr T kDummy = Ascending ? upper_bound<T>() : lower_bound<T>();
+  /** Width of the subwarp. */
+  static constexpr int kWarpWidth = std::min<int>(Capacity, WarpSize);
+  /** The number of elements to select. */
+  const int k;
+  /**
+   * Construct the warp_sort empty queue.
+   * Every per-thread key slot is filled with `kDummy`; the index slots are
+   * left uninitialized.
+   *
+   * @param k
+   *   number of elements to select.
+   */
+  __device__ warp_sort(int k) : k(k) {
+#pragma unroll
+    for (int i = 0; i < kMaxArrLen; i++) {
+      val_arr_[i] = kDummy;
+    }
+  }
+  /**
+   * Load k values from the pointers at the given position, and merge them in
+   * the storage.
+   *
+   * When it actually loads the values, it always performs some collective warp
+   * operations in the end, thus enforcing warp sync. This means, it's safe to
+   * call `store` with the same arguments after `load_sorted` without extra
+   * sync. Note, however, that this is not necessarily true for the reverse
+   * order, because the access patterns of `store` and `load_sorted` are
+   * different.
+   *
+   * @param[in] in
+   *    a device pointer to a contiguous array, unique per-subwarp
+   *    (length: k <= kWarpWidth * kMaxArrLen).
+   * @param[in] in_idx
+   *    a device pointer to a contiguous array, unique per-subwarp
+   *    (length: k <= kWarpWidth * kMaxArrLen).
+   * @param[in] do_merge
+   *    must be the same for all threads within a subwarp of size `kWarpWidth`.
+   *    It serves as a conditional; when `false` the function does nothing.
+   *    We need it to ensure threads within a full warp don't diverge calling
+   * `bitonic::merge()`.
+   */
+  __device__ void load_sorted(
+      const T* in,
+      const IdxT* in_idx,
+      bool do_merge = true) {
+    if (do_merge) {
+      // Mirror the lane position within the subwarp and walk the per-thread
+      // slots last-to-first, so the input is visited in the reverse of the
+      // layout written by `store`.
+      int idx = Pow2<kWarpWidth>::mod(laneId()) ^ Pow2<kWarpWidth>::Mask;
+#pragma unroll
+      for (int i = kMaxArrLen - 1; i >= 0; --i, idx += kWarpWidth) {
+        if (idx < k) {
+          T t = in[idx];
+          // Keep whichever of the stored and the loaded value orders first.
+          if (is_ordered<Ascending>(t, val_arr_[i])) {
+            val_arr_[i] = t;
+            idx_arr_[i] = in_idx[idx];
+          }
+        }
+      }
+    }
+    // With subwarps narrower than a warp, every thread takes part in the
+    // merge regardless of `do_merge`.
+    if (kWarpWidth < WarpSize || do_merge) {
+      topk::bitonic<kMaxArrLen>(Ascending, kWarpWidth)
+          .merge(val_arr_, idx_arr_);
+    }
+  }
+  /**
+   *  Save the content by the pointer location.
+   *
+   * @param[out] out
+   *   device pointer to a contiguous array, unique per-subwarp of size
+   * `kWarpWidth` (length: k <= kWarpWidth * kMaxArrLen).
+   * @param[out] out_idx
+   *   device pointer to a contiguous array, unique per-subwarp of size
+   * `kWarpWidth` (length: k <= kWarpWidth * kMaxArrLen).
+   */
+  __device__ void store(T* out, IdxT* out_idx) const {
+    // Slot i of the lane at subwarp position p is written to
+    // p + i * kWarpWidth; positions at or beyond k are not written.
+    int idx = Pow2<kWarpWidth>::mod(laneId());
+#pragma unroll kMaxArrLen
+    for (int i = 0; i < kMaxArrLen && idx < k; i++, idx += kWarpWidth) {
+      out[idx] = val_arr_[i];
+      out_idx[idx] = idx_arr_[i];
+    }
+  }
+ protected:
+  /** Number of queue elements held by each thread. */
+  static constexpr int kMaxArrLen = Capacity / kWarpWidth;
+  /** Per-thread part of the queue keys. */
+  T val_arr_[kMaxArrLen];
+  /** Per-thread part of the queue payloads, parallel to `val_arr_`. */
+  IdxT idx_arr_[kMaxArrLen];
+  /**
+   * Merge another array (sorted in the opposite direction) in the queue.
+   * Thanks to the other array being sorted in the opposite direction,
+   * it's enough to call bitonic.merge once to maintain the valid state
+   * of the queue.
+   *
+   * @tparam PerThreadSizeIn
+   *   the size of the other array per-thread (compared to `kMaxArrLen`).
+   *
+   * @param keys_in
+   *   the values to be merged in. Pointers are unique per-thread. The values
+   *   must already be sorted in the opposite direction.
+   *   The layout of `keys_in` must be the same as the layout of `val_arr_`.
+   * @param ids_in
+   *   the associated indices of the elements in the same format as `keys_in`.
+   */
+  template <int PerThreadSizeIn>
+  __device__ __forceinline__ void merge_in(
+      const T* __restrict__ keys_in,
+      const IdxT* __restrict__ ids_in) {
+    // Compare the trailing min(kMaxArrLen, PerThreadSizeIn) slots of both
+    // arrays pairwise and keep the element that orders first.
+#pragma unroll
+    for (int i = std::min(kMaxArrLen, PerThreadSizeIn); i > 0; i--) {
+      T& key = val_arr_[kMaxArrLen - i];
+      T other = keys_in[PerThreadSizeIn - i];
+      if (is_ordered<Ascending>(other, key)) {
+        key = other;
+        idx_arr_[kMaxArrLen - i] = ids_in[PerThreadSizeIn - i];
+      }
+    }
+    topk::bitonic<kMaxArrLen>(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
+  }
+};
+/**
+ * This version of warp_sort compares each input element against the current
+ * estimate of k-th value before adding it to the intermediate sorting buffer.
+ * This makes the algorithm do less sorting steps for long input sequences
+ * at the cost of extra checks on each step.
+ *
+ * This implementation is preferred for large len values.
+ */
+template <int Capacity, bool Ascending, typename T, typename IdxT>
+class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
+ public:
+  using warp_sort<Capacity, Ascending, T, IdxT>::kDummy;
+  using warp_sort<Capacity, Ascending, T, IdxT>::kWarpWidth;
+  using warp_sort<Capacity, Ascending, T, IdxT>::k;
+  /**
+   * Construct an empty queue: the buffer keys and the k-th value estimate
+   * start at `kDummy`, and the buffer length at zero.
+   *
+   * @param k number of elements to select.
+   */
+  __device__ warp_sort_filtered(int k)
+      : warp_sort<Capacity, Ascending, T, IdxT>(k), buf_len_(0), k_th_(kDummy) {
+#pragma unroll
+    for (int i = 0; i < kMaxBufLen; i++) {
+      val_buf_[i] = kDummy;
+    }
+  }
+  /**
+   * Offer one element to the queue.
+   * Contains a warp-wide vote (`any`).
+   *
+   * @param val key of the element
+   * @param idx payload stored alongside the key
+   */
+  __device__ void add(T val, IdxT idx) {
+    // comparing for k_th should reduce the total amount of updates:
+    // `false` means the input value is surely not in the top-k values.
+    bool do_add = is_ordered<Ascending>(val, k_th_);
+    // merge the buf if it's full and we cannot add an element anymore.
+    if (any(buf_len_ + do_add > kMaxBufLen)) {
+      // still, add an element before merging if possible for this thread
+      if (do_add && buf_len_ < kMaxBufLen) {
+        add_to_buf_(val, idx);
+        do_add = false;
+      }
+      merge_buf_();
+    }
+    // add an element if necessary and haven't already.
+    if (do_add) {
+      add_to_buf_(val, idx);
+    }
+  }
+  /** Flush the buffer into the queue if any voting thread still holds data. */
+  __device__ void done() {
+    if (any(buf_len_ != 0)) {
+      merge_buf_();
+    }
+  }
+ private:
+  /** Refresh `k_th_` from the queue slot that holds element number k - 1. */
+  __device__ __forceinline__ void set_k_th_() {
+    // NB on using srcLane: it's ok if it is outside the warp size / width;
+    //                      the modulo op will be done inside the __shfl_sync.
+    // const int id = (k - 1) / kWarpWidth;
+    const int id = Pow2<kWarpWidth>::div(k - 1);
+    // The unrolled loop selects val_arr_[id] using compile-time indices only.
+#pragma unroll
+    for (int i = 0; i < kMaxArrLen; i++) {
+      if (i == id) {
+        k_th_ = shfl(val_arr_[i], k - 1, kWarpWidth);
+      }
+    }
+    // k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
+  }
+  /**
+   * Sort the buffer in the opposite direction, merge it into the queue,
+   * then reset the buffer and refresh `k_th_`.
+   */
+  __device__ __forceinline__ void merge_buf_() {
+    topk::bitonic<kMaxBufLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+    this->merge_in<kMaxBufLen>(val_buf_, idx_buf_);
+    buf_len_ = 0;
+    set_k_th_(); // contains warp sync
+#pragma unroll
+    for (int i = 0; i < kMaxBufLen; i++) {
+      val_buf_[i] = kDummy;
+    }
+  }
+  /** Append one element to this thread's buffer at position `buf_len_`. */
+  __device__ __forceinline__ void add_to_buf_(T val, IdxT idx) {
+    // NB: the loop is used here to ensure the constant indexing,
+    //     to not force the buffers spill into the local memory.
+#pragma unroll
+    for (int i = 0; i < kMaxBufLen; i++) {
+      if (i == buf_len_) {
+        val_buf_[i] = val;
+        idx_buf_[i] = idx;
+      }
+    }
+    buf_len_++;
+  }
+  using warp_sort<Capacity, Ascending, T, IdxT>::kMaxArrLen;
+  using warp_sort<Capacity, Ascending, T, IdxT>::val_arr_;
+  using warp_sort<Capacity, Ascending, T, IdxT>::idx_arr_;
+  /** Per-thread buffer length: 2 for Capacity <= 64, otherwise 4. */
+  static constexpr int kMaxBufLen = (Capacity <= 64) ? 2 : 4;
+  /** Keys waiting to be merged into the queue. */
+  T val_buf_[kMaxBufLen];
+  /** Payloads parallel to `val_buf_`. */
+  IdxT idx_buf_[kMaxBufLen];
+  /** Number of elements this thread currently holds in the buffer. */
+  int buf_len_;
+  /** Threshold used by `add` to reject values; updated in `set_k_th_`. */
+  T k_th_;
+};
+/**
+ * This version of warp_sort adds every input element into the intermediate
+ * sorting buffer, and thus does the sorting step every `Capacity` input
+ * elements.
+ *
+ * This implementation is preferred for very small len values.
+ */
+template <int Capacity, bool Ascending, typename T, typename IdxT>
+class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
+ public:
+  using warp_sort<Capacity, Ascending, T, IdxT>::kDummy;
+  using warp_sort<Capacity, Ascending, T, IdxT>::kWarpWidth;
+  using warp_sort<Capacity, Ascending, T, IdxT>::k;
+  /**
+   * Construct an empty queue: the buffer keys start at `kDummy` and the
+   * buffer length at zero.
+   *
+   * @param k number of elements to select.
+   */
+  __device__ warp_sort_immediate(int k)
+      : warp_sort<Capacity, Ascending, T, IdxT>(k), buf_len_(0) {
+#pragma unroll
+    for (int i = 0; i < kMaxArrLen; i++) {
+      val_buf_[i] = kDummy;
+    }
+  }
+  /**
+   * Append one element to the buffer; once the buffer holds `kMaxArrLen`
+   * elements, sort it and merge it into the queue.
+   *
+   * @param val key of the element
+   * @param idx payload stored alongside the key
+   */
+  __device__ void add(T val, IdxT idx) {
+    // NB: the loop is used here to ensure the constant indexing,
+    //     to not force the buffers spill into the local memory.
+#pragma unroll
+    for (int i = 0; i < kMaxArrLen; ++i) {
+      if (i == buf_len_) {
+        val_buf_[i] = val;
+        idx_buf_[i] = idx;
+      }
+    }
+    ++buf_len_;
+    if (buf_len_ == kMaxArrLen) {
+      // The buffer is sorted in the opposite direction, as merge_in requires.
+      topk::bitonic<kMaxArrLen>(!Ascending, kWarpWidth)
+          .sort(val_buf_, idx_buf_);
+      this->merge_in<kMaxArrLen>(val_buf_, idx_buf_);
+#pragma unroll
+      for (int i = 0; i < kMaxArrLen; i++) {
+        val_buf_[i] = kDummy;
+      }
+      buf_len_ = 0;
+    }
+  }
+  /**
+   * Merge a partially filled buffer into the queue.
+   * The buffer is not reset afterwards.
+   */
+  __device__ void done() {
+    if (buf_len_ != 0) {
+      topk::bitonic<kMaxArrLen>(!Ascending, kWarpWidth)
+          .sort(val_buf_, idx_buf_);
+      this->merge_in<kMaxArrLen>(val_buf_, idx_buf_);
+    }
+  }
+ private:
+  using warp_sort<Capacity, Ascending, T, IdxT>::kMaxArrLen;
+  using warp_sort<Capacity, Ascending, T, IdxT>::val_arr_;
+  using warp_sort<Capacity, Ascending, T, IdxT>::idx_arr_;
+  /** Keys waiting to be merged into the queue. */
+  T val_buf_[kMaxArrLen];
+  /** Payloads parallel to `val_buf_`. */
+  IdxT idx_buf_[kMaxArrLen];
+  /** Number of elements this thread currently holds in the buffer. */
+  int buf_len_;
+};
+/**
+ * @brief Ceiling division, i.e. ceil(a / b), computed as (a + b - 1) / b.
+ * @tparam IntType supposed to be only integers for now!
+ * @param a dividend
+ * @param b divisor
+ */
+template <typename IntType>
+constexpr inline __host__ __device__ IntType ceildiv(IntType a, IntType b) {
+  const auto biased = a + b - 1;
+  return biased / b;
+}
+/**
+ * Round `num` up to the nearest multiple of 256
+ * (same result as `(num + 255) / 256 * 256` for non-negative values).
+ *
+ * Callable from both host and device: it is used by the host-side
+ * shared-memory size calculation as well as by device code.
+ *
+ * @tparam IntType integer type of the value
+ * @param num value to round up
+ */
+template <typename IntType>
+constexpr inline __host__ __device__ IntType roundUp256(IntType num) {
+  constexpr int MASK = 255;
+  return (num + MASK) & (~MASK);
+}
+/**
+ * Number of bytes of shared memory for a block-wide sort: a key region of
+ * `ceildiv(num_of_subwarp, 2) * k` elements of T, rounded up to a multiple of
+ * 256 bytes, followed by an index region of the same element count of IdxT.
+ */
+template <typename T, typename IdxT>
+auto calc_smem_size_for_block_wide(int num_of_subwarp, int k) -> int {
+  const auto key_bytes = roundUp256(ceildiv(num_of_subwarp, 2) * sizeof(T) * k);
+  const auto idx_bytes = ceildiv(num_of_subwarp, 2) * sizeof(IdxT) * k;
+  return key_bytes + idx_bytes;
+}
+template <
+    template <int, bool, typename, typename>
+    class WarpSortWarpWide,
+    int Capacity,
+    bool Ascending,
+    typename T,
+    typename IdxT>
+class block_sort {
+  using queue_t = WarpSortWarpWide<Capacity, Ascending, T, IdxT>;
+ public:
+  /**
+   * Construct the per-thread queue and split the shared-memory buffer.
+   * Keys start at `smem_buf`; indices start after the key region, whose size
+   * is rounded up with `roundUp256`.
+   *
+   * @param k number of elements to select.
+   * @param smem_buf start of the buffer used for the cross-warp merge.
+   */
+  __device__ block_sort(int k, uint8_t* smem_buf) : queue_(k) {
+    val_smem_ = reinterpret_cast<T*>(smem_buf);
+    const int num_of_warp = subwarp_align::div(blockDim.x);
+    idx_smem_ = reinterpret_cast<IdxT*>(
+        smem_buf + roundUp256(ceildiv(num_of_warp, 2) * sizeof(T) * k));
+  }
+  /** Forward one element to this thread's warp-level queue. */
+  __device__ void add(T val, IdxT idx) {
+    queue_.add(val, idx);
+  }
+  /**
+   * At the point of calling this function, the warp-level queues consumed all
+   * input independently. The remaining work to be done is to merge them
+   * together.
+   *
+   * Here we tree-merge the results using the shared memory and block sync.
+   */
+  __device__ void done() {
+    queue_.done();
+    const int warp_id = subwarp_align::div(threadIdx.x);
+    // NB: there is no need for the second __syncthreads between .load_sorted
+    // and .store:
+    //     we shift the pointers every iteration, such that individual warps
+    //     either access the same locations or do not overlap with any of the
+    //     other warps. The access patterns within warps are different for the
+    //     two functions, but .load_sorted implies warp sync at the end, so
+    //     there is no need for __syncwarp either.
+    // Each iteration: subwarps [split, nwarps) store their queues, then
+    // subwarps [0, nwarps - split) load and merge them; nwarps shrinks to
+    // split until a single subwarp remains.
+    for (int shift_mask = ~0,
+             nwarps = subwarp_align::div(blockDim.x),
+             split = (nwarps + 1) >> 1;
+         nwarps > 1;
+         nwarps = split, split = (nwarps + 1) >> 1) {
+      if (warp_id < nwarps && warp_id >= split) {
+        int dst_warp_shift = (warp_id - (split & shift_mask)) * queue_.k;
+        queue_.store(val_smem_ + dst_warp_shift, idx_smem_ + dst_warp_shift);
+      }
+      __syncthreads();
+      shift_mask = ~shift_mask; // invert the mask
+      {
+        int src_warp_shift = (warp_id + (split & shift_mask)) * queue_.k;
+        // The last argument serves as a condition for loading
+        //  -- to make sure threads within a full warp do not diverge on
+        //  `bitonic::merge()`
+        queue_.load_sorted(
+            val_smem_ + src_warp_shift,
+            idx_smem_ + src_warp_shift,
+            warp_id < nwarps - split);
+      }
+    }
+  }
+  /**
+   * Save the content by the pointer location.
+   * Only threads with `threadIdx.x` below the subwarp width write.
+   */
+  __device__ void store(T* out, IdxT* out_idx) const {
+    if (threadIdx.x < subwarp_align::Value) {
+      queue_.store(out, out_idx);
+    }
+  }
+ private:
+  /** Power-of-two helper for the subwarp width of the underlying queue. */
+  using subwarp_align = Pow2<queue_t::kWarpWidth>;
+  /** This thread's part of the warp-level queue. */
+  queue_t queue_;
+  /** Key region of the merge buffer. */
+  T* val_smem_;
+  /** Index region of the merge buffer. */
+  IdxT* idx_smem_;
+};
+} // namespace topk
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/ctc_fast_divmod.cuh
+++ b/torchaudio/csrc/cuctc/src/ctc_fast_divmod.cuh
+/**
+ *  Modified from NVIDIA/cutlass(https://github.com/NVIDIA/cutlass)
+ *
+ */
+#pragma once
+namespace cu_ctc {
+/**
+ * Count leading zeros by scanning bits 31 down to 0 of `x`; returns 32 when
+ * none of those bits is set.
+ */
+template <typename value_t>
+__host__ __device__ __forceinline__ value_t clz(value_t x) {
+  int bit = 31;
+  while (bit >= 0 && !((1 << bit) & x)) {
+    --bit;
+  }
+  // bit == -1 here when no bit was found, which yields 32.
+  return 31 - bit;
+}
+/**
+ * Base-2 logarithm of `x`, rounded up: the index of the highest set bit
+ * (via `clz`), plus one when `x` is not a power of two.
+ */
+template <typename value_t>
+__host__ __device__ __forceinline__ value_t find_log2(value_t x) {
+  const int highest_bit = int(31 - clz(x));
+  const bool has_lower_bits = (x & (x - 1)) != 0;
+  return has_lower_bits ? highest_bit + 1 : highest_bit;
+}
+/**
+ * Find the multiplier and shift that `fast_divmod` uses in place of a
+ * division by `denom`, using find_log2.
+ *
+ * @param[out] mul multiplier (0 when `denom == 1`)
+ * @param[out] shr right shift applied after the high multiply
+ *                 (0 when `denom == 1`)
+ * @param[in] denom the divisor
+ */
+__host__ __device__ __forceinline__ void find_divisor(
+    unsigned int& mul,
+    unsigned int& shr,
+    unsigned int denom) {
+  if (denom == 1) {
+    mul = 0;
+    shr = 0;
+    return;
+  }
+  const unsigned int p = 31 + find_log2(denom);
+  // ceil(2^p / denom), truncated to 32 bits.
+  mul = unsigned(((1ull << p) + unsigned(denom) - 1) / unsigned(denom));
+  shr = p - 32;
+}
+/**
+ * Compute `quo = src / div` and `rem = src - quo * div` from the multiplier
+ * and shift precomputed by `find_divisor`.
+ */
+__host__ __device__ __forceinline__
+    void
+    fast_divmod(
+        int& quo,
+        int& rem,
+        int src,
+        int div,
+        unsigned int mul,
+        unsigned int shr) {
+  if (div == 1) {
+    // Division by one: simply copy the source.
+    quo = src;
+  } else {
+#if defined(__CUDA_ARCH__)
+    // High 32 bits of the product via the device intrinsic.
+    quo = __umulhi(src, mul) >> shr;
+#else
+    quo = int(((int64_t)src * mul) >> 32) >> shr;
+#endif
+  }
+  // The remainder.
+  rem = src - (quo * div);
+}
+// For long int input.
+//
+// Computes `quo` from the multiplier and shift precomputed by `find_divisor`
+// and `rem = src - quo * div`, with the remainder evaluated in 64 bits.
+//
+// NOTE(review): on the device path `__umulhi` takes 32-bit operands, so `src`
+// is narrowed before the multiply; dividends that do not fit in 32 bits look
+// unsupported there -- confirm against the callers.
+__host__ __device__ __forceinline__ void fast_divmod(
+    int& quo,
+    int64_t& rem,
+    int64_t src,
+    int div,
+    unsigned int mul,
+    unsigned int shr) {
+#if defined(__CUDA_ARCH__)
+  // Use IMUL.HI if div != 1, else simply copy the source.
+  quo = (div != 1) ? __umulhi(src, mul) >> shr : src;
+#else
+  quo = int((div != 1) ? ((src * mul) >> 32) >> shr : src);
+#endif
+  // The remainder. The product is widened to 64 bits first: `quo * div` is
+  // close to `src` and overflows a 32-bit int once `src` exceeds INT_MAX.
+  rem = src - (static_cast<int64_t>(quo) * div);
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Division + modulus by a fixed divisor, using values computed once up front.
+///
+/// The constructor derives a multiplier/shift pair from the divisor with
+/// find_divisor(); every later call forwards to fast_divmod() with that pair.
+/// This pays off when the same divisor is reused many times, for example when
+/// the object is built on the host and passed to a kernel as an argument.
+///
+/// Example:
+///
+///   int quotient, remainder, dividend, divisor;
+///
+///   FastDivmod divmod(divisor);
+///
+///   divmod(quotient, remainder, dividend);
+///
+///   // quotient = (dividend / divisor)
+///   // remainder = (dividend % divisor)
+///
+struct FastDivmod {
+  int divisor;
+  unsigned int multiplier;
+  unsigned int shift_right;
+
+  /// Default state: all three members are zero.
+  __host__ __device__ __forceinline__ FastDivmod()
+      : divisor(0), multiplier(0), shift_right(0) {}
+
+  /// Precompute multiplier and shift for `divisor_`. This is the expensive
+  /// part, so it is ideally done once in host code.
+  __host__ __device__ __forceinline__ FastDivmod(int divisor_)
+      : divisor(divisor_) {
+    find_divisor(multiplier, shift_right, divisor);
+  }
+
+  /// Write dividend / divisor to `quotient` and the rest to `remainder`.
+  __host__ __device__ __forceinline__ void operator()(
+      int& quotient,
+      int& remainder,
+      int dividend) const {
+    fast_divmod(
+        quotient, remainder, dividend, divisor, multiplier, shift_right);
+  }
+
+  /// Same as operator(), but the quotient is the return value.
+  __host__ __device__ __forceinline__ int divmod(int& remainder, int dividend)
+      const {
+    int result;
+    (*this)(result, remainder, dividend);
+    return result;
+  }
+
+  /// 64-bit dividend/remainder variant of operator().
+  __host__ __device__ __forceinline__ void operator()(
+      int& quotient,
+      int64_t& remainder,
+      int64_t dividend) const {
+    fast_divmod(
+        quotient, remainder, dividend, divisor, multiplier, shift_right);
+  }
+
+  /// 64-bit dividend/remainder variant of divmod().
+  __host__ __device__ __forceinline__ int divmod(
+      int64_t& remainder,
+      int64_t dividend) const {
+    int result;
+    (*this)(result, remainder, dividend);
+    return result;
+  }
+};
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/ctc_prefix_decoder.cpp
+++ b/torchaudio/csrc/cuctc/src/ctc_prefix_decoder.cpp
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include <cuda_runtime.h>
+#include "include/ctc_prefix_decoder.h"
+#include "include/ctc_prefix_decoder_host.h"
+#include "device_data_wrap.h"
+#include "device_log_prob.cuh"
+namespace cu_ctc {
+// Decoder state shared by the host entry points in this file. It is created by
+// prefixCTC_alloc, filled in by calculate_require_buff_and_init_internal_data
+// and released by prefixCTC_free. Buffer sizes below are the pre-alignment
+// element counts used by that init routine.
+struct InternalData {
+  // CUDA stream used for the memsets, copies and kernel wrappers in this file;
+  // assigned in prefixCTC_alloc.
+  cudaStream_t stream;
+  // Vocabulary size.
+  int lc;
+  // Row stride of ptable/ptablen; set equal to lc.
+  int ldc;
+  // Batch size.
+  int bs;
+  // Beam width.
+  int beam;
+  // Beam width rounded up to a multiple of 16 (stride of per-beam arrays).
+  int ldbeam;
+  // Input sequence length (seq_len).
+  int time;
+  // Sequence length rounded up to a multiple of 16 (stride of clist rows).
+  int ldseq_len;
+  // bs * ldbeam float2 entries.
+  DeviceDataWrap<float2> pprev;
+  // bs * beam * ldc floats; per the comments in the decode loop, the
+  // "end in blank" probability table.
+  DeviceDataWrap<float> ptable;
+  // bs * beam * ldc floats; per the comments in the decode loop, the
+  // "not end in blank" probability table.
+  DeviceDataWrap<float> ptablen;
+  // ldbeam * bs ints.
+  DeviceDataWrap<int> clast;
+  // Two ldbeam * bs int buffers, indexed by step parity in prefixCTC_V2.
+  DeviceDataWrap<int> clen[2];
+  // Two ldseq_len * beam * bs int buffers, indexed by step parity.
+  DeviceDataWrap<int> clist[2];
+  // bs * ldbeam ints.
+  DeviceDataWrap<int> ptid;
+  // bs * ldbeam floats; copied back to the host at the end of decoding.
+  DeviceDataWrap<float> score;
+  // beam * MAX_BLOCKS floats of top-k scratch.
+  DeviceDataWrap<float> topk_key_buffer;
+  // beam * MAX_BLOCKS ints of top-k scratch.
+  DeviceDataWrap<int> topk_value_buffer;
+  // batch_size * seq_len ints; exposed to log_prob as select_seqs.
+  DeviceDataWrap<int> select_seqs;
+  // batch_size ints; exposed to log_prob as select_seq_lens.
+  DeviceDataWrap<int> select_seq_lens;
+  // Descriptor of the input log-probability tensor and the selection buffers.
+  LogProb log_prob;
+  // Result of init_log_prob_and_cal_max_select_seq_len; bounds the decode loop.
+  int max_select_seq_len;
+};
+/**
+ * Compute the scratch-buffer size the decoder needs and, when the supplied
+ * buffer is large enough, carve it into the per-array views of `inter_data`
+ * and initialize its LogProb descriptor.
+ *
+ * @param inter_data        state object to fill in (its stream must be set)
+ * @param batch_size        number of sequences
+ * @param seq_len           time steps per sequence
+ * @param vocab_size        vocabulary size
+ * @param beam              beam width
+ * @param buff_ptr          address of the caller-provided device buffer
+ * @param buff_size         size of that buffer in bytes
+ * @param log_prob_data_ptr pointer to the log-probability data
+ * @param original_lens     per-sequence lengths
+ * @param prob_sizes        sizes of log_prob; must be {batch, seq_len, vocab}
+ * @param prob_strides      strides of log_prob (3 entries)
+ * @param blid              blank token id
+ * @param threshold         forwarded to init_log_prob_and_cal_max_select_seq_len
+ * @return {0, 0} when any dimension is non-positive (nothing to decode);
+ *         {required_bytes, 0} when `buff_size` is too small;
+ *         {0, max_select_seq_len} on success.
+ */
+std::tuple<size_t, int> calculate_require_buff_and_init_internal_data(
+    InternalData* inter_data,
+    int batch_size,
+    int seq_len,
+    int vocab_size,
+    int beam,
+    std::uintptr_t buff_ptr,
+    size_t buff_size,
+    float* log_prob_data_ptr,
+    int* original_lens,
+    const std::vector<int>& prob_sizes,
+    const std::vector<int>& prob_strides,
+    int blid,
+    float threshold) {
+  // Test each dimension separately: multiplying four ints first can overflow
+  // and wrongly report a large problem as empty.
+  if (batch_size <= 0 || beam <= 0 || seq_len <= 0 || vocab_size <= 0)
+    return {0, 0};
+  CHECK(prob_sizes.size() == 3, "only support 3D log_prob.");
+  CHECK(prob_strides.size() == 3, "only support 3D log_prob. ");
+  CHECK(
+      prob_sizes[0] == batch_size && prob_sizes[1] == seq_len &&
+          prob_sizes[2] == vocab_size,
+      "batch_size ,seq_len ,vocab_size must match with porb_size");
+  // Round a byte count (or an address) up to the next multiple of ALIGN_BYTES.
+  auto align_size = [](size_t size) -> size_t {
+    return (size + ALIGN_BYTES - 1) / ALIGN_BYTES * ALIGN_BYTES;
+  };
+  int lc = vocab_size;
+  int ldc = lc;
+  // Padded strides: beam and seq_len rounded up to multiples of 16.
+  int ldbeam = ((beam - 1) / 16 + 1) * 16;
+  int ldseq_len = (seq_len + 16 - 1) / 16 * 16;
+  int bs = batch_size;
+  int time = seq_len;
+  size_t require_size = 0;
+  size_t pprev_size = sizeof(float2) * bs * ldbeam;
+  size_t pprev_align_size = align_size(pprev_size);
+  require_size += pprev_align_size;
+  // Keep sizeof(...) leftmost so the whole product is evaluated in size_t.
+  size_t ptable_size = sizeof(float) * bs * beam * ldc;
+  size_t ptablen_size = sizeof(float) * bs * beam * ldc;
+  size_t ptable_align_size = align_size(ptable_size);
+  size_t ptablen_align_size = align_size(ptablen_size);
+  require_size += ptable_align_size;
+  require_size += ptablen_align_size;
+  size_t clast_align_size = align_size(sizeof(int) * ldbeam * bs);
+  require_size += clast_align_size;
+  size_t clen_align_size = align_size(sizeof(int) * ldbeam * bs);
+  size_t clist_align_size = align_size(sizeof(int) * ldseq_len * beam * bs);
+  // clen and clist are double-buffered.
+  require_size += 2 * clen_align_size;
+  require_size += 2 * clist_align_size;
+  size_t ptid_align_size = align_size(sizeof(int) * bs * ldbeam);
+  require_size += ptid_align_size;
+  size_t score_align_size = align_size(sizeof(float) * bs * ldbeam);
+  require_size += score_align_size;
+  size_t key_buff_align_size = align_size(sizeof(float) * beam * MAX_BLOCKS);
+  size_t value_buff_align_size = align_size(sizeof(int) * beam * MAX_BLOCKS);
+  require_size += (key_buff_align_size + value_buff_align_size);
+  size_t select_seqs_align_size =
+      align_size(sizeof(int) * batch_size * seq_len);
+  require_size += select_seqs_align_size;
+  size_t select_seq_lens_align_size = align_size(sizeof(int) * batch_size);
+  require_size += select_seq_lens_align_size;
+  // Slack for rounding the start address of the buffer up to ALIGN_BYTES.
+  require_size += ALIGN_BYTES;
+  if (require_size > buff_size)
+    return {require_size, 0};
+  char* buff_align_ptr = reinterpret_cast<char*>(align_size(buff_ptr));
+  inter_data->beam = beam;
+  inter_data->ldbeam = ldbeam;
+  inter_data->bs = bs;
+  inter_data->lc = lc;
+  inter_data->ldc = ldc;
+  inter_data->time = time;
+  inter_data->ldseq_len = ldseq_len;
+// Bind the next SIZE bytes of the aligned buffer to inter_data->NAME and
+// advance the cursor.
+#define SET_DATA(NAME, TYPE, SIZE)                                         \
+  inter_data->NAME =                                                       \
+      DeviceDataWrap<TYPE>(reinterpret_cast<TYPE*>(buff_align_ptr), SIZE); \
+  buff_align_ptr += SIZE;
+  SET_DATA(pprev, float2, pprev_align_size);
+  SET_DATA(ptable, float, ptable_align_size);
+  SET_DATA(ptablen, float, ptablen_align_size);
+  SET_DATA(clast, int, clast_align_size);
+  SET_DATA(clen[0], int, clen_align_size);
+  SET_DATA(clen[1], int, clen_align_size);
+  SET_DATA(clist[0], int, clist_align_size);
+  SET_DATA(clist[1], int, clist_align_size);
+  SET_DATA(ptid, int, ptid_align_size);
+  SET_DATA(score, float, score_align_size);
+  SET_DATA(topk_key_buffer, float, key_buff_align_size);
+  SET_DATA(topk_value_buffer, int, value_buff_align_size);
+  SET_DATA(select_seqs, int, select_seqs_align_size);
+  SET_DATA(select_seq_lens, int, select_seq_lens_align_size);
+#undef SET_DATA
+  // init log_prob
+  inter_data->log_prob.data_ptr = log_prob_data_ptr;
+  inter_data->log_prob.origin_seq_lens = original_lens;
+  inter_data->log_prob.select_seqs = inter_data->select_seqs.data_ptr();
+  inter_data->log_prob.select_seq_lens = inter_data->select_seq_lens.data_ptr();
+  inter_data->log_prob.batch = batch_size;
+  inter_data->log_prob.vocab_size = vocab_size;
+  inter_data->log_prob.seq_len = seq_len;
+  inter_data->log_prob.batch_stride = prob_strides[0];
+  inter_data->log_prob.seq_len_stride = prob_strides[1];
+  inter_data->log_prob.vocab_stride = prob_strides[2];
+  inter_data->max_select_seq_len = init_log_prob_and_cal_max_select_seq_len(
+      &(inter_data->log_prob), blid, threshold, inter_data->stream);
+  return {0, inter_data->max_select_seq_len};
+}
+/**
+ * Run one decoding step on `inter_data`.
+ *
+ * Step 0 calls CTC_prob_first_step_V2 only. Every later step runs the
+ * matrix, merge and top-k stages in that order, and on the last step also
+ * calls CTC_copy_list_len_for_differnet_parity. The clen/clist pairs are
+ * double-buffered and selected by the parity of `step`.
+ *
+ * @param inter_data         decoder state
+ * @param blid               blank token id
+ * @param spid               space token id
+ * @param step               index of the step to run
+ * @param is_last_step       true for the final step
+ * @param max_select_seq_len forwarded to the parity-copy stage
+ * @return always 0
+ */
+int prefixCTC_V2(
+    InternalData* inter_data,
+    int blid,
+    int spid,
+    int step,
+    bool is_last_step,
+    int max_select_seq_len) {
+  LogProb* log_prob_struct = &(inter_data->log_prob);
+  // Buffer slot for this step's parity, and the opposite slot.
+  const int cur = step % 2;
+  const int other = cur ^ 1;
+
+  if (step == 0) {
+    CTC_prob_first_step_V2(
+        log_prob_struct,
+        step,
+        inter_data->pprev,
+        inter_data->ptid,
+        inter_data->clast,
+        inter_data->clen[cur],
+        inter_data->clist[cur],
+        inter_data->beam,
+        inter_data->ldbeam,
+        inter_data->ldseq_len,
+        inter_data->bs,
+        inter_data->score,
+        inter_data->stream,
+        blid);
+    return 0;
+  }
+
+  CTC_prob_matrix_V2(
+      log_prob_struct,
+      step,
+      inter_data->pprev,
+      inter_data->ptable,
+      inter_data->ptablen,
+      inter_data->clast,
+      inter_data->lc,
+      inter_data->ldc,
+      inter_data->beam,
+      inter_data->ldbeam,
+      inter_data->bs,
+      blid,
+      spid,
+      inter_data->stream);
+  CTC_prob_merge_V2(
+      log_prob_struct,
+      step,
+      inter_data->ptable,
+      inter_data->ptablen,
+      inter_data->ptid,
+      inter_data->clast,
+      inter_data->clist[other],
+      inter_data->clen[other],
+      inter_data->lc,
+      inter_data->ldc,
+      inter_data->beam,
+      inter_data->ldbeam,
+      inter_data->ldseq_len,
+      inter_data->bs,
+      inter_data->stream,
+      blid);
+  CTC_prob_topK_V2(
+      log_prob_struct,
+      step,
+      inter_data->pprev,
+      inter_data->ptable,
+      inter_data->ptablen,
+      inter_data->ptid,
+      inter_data->clast,
+      inter_data->clen[other],
+      inter_data->clen[cur],
+      inter_data->clist[other],
+      inter_data->clist[cur],
+      inter_data->lc,
+      inter_data->ldc,
+      inter_data->beam,
+      inter_data->ldbeam,
+      inter_data->ldseq_len,
+      blid,
+      inter_data->bs,
+      inter_data->score,
+      inter_data->topk_key_buffer,
+      inter_data->topk_value_buffer,
+      inter_data->stream,
+      is_last_step);
+  if (is_last_step) {
+    // Sequences whose selected length has a different parity from
+    // max_select_seq_len have their clist/clen copied into the other
+    // buffer pair.
+    CTC_copy_list_len_for_differnet_parity(
+        log_prob_struct,
+        step,
+        max_select_seq_len,
+        inter_data->clen[other],
+        inter_data->clen[cur],
+        inter_data->clist[other],
+        inter_data->clist[cur],
+        inter_data->bs,
+        inter_data->beam,
+        inter_data->ldbeam,
+        inter_data->ldseq_len,
+        inter_data->stream);
+  }
+  return 0;
+}
+/**
+ * Allocate a decoder state object bound to the given CUDA stream.
+ *
+ * The object is value-initialized so that its scalar members (notably
+ * max_select_seq_len, which bounds the decode loop) start at zero even when
+ * calculate_require_buff_and_init_internal_data returns early without
+ * filling them in.
+ *
+ * @param stream_ptr a cudaStream_t passed as an integer
+ * @return the new InternalData pointer as an integer handle; release it with
+ *         prefixCTC_free
+ */
+std::uintptr_t prefixCTC_alloc(std::uintptr_t stream_ptr) {
+  InternalData* inter_data = new InternalData();
+  inter_data->stream = reinterpret_cast<cudaStream_t>(stream_ptr);
+  return reinterpret_cast<std::uintptr_t>(inter_data);
+}
+/**
+ * Destroy a decoder state object created by prefixCTC_alloc.
+ *
+ * @param inter_data_ptr handle returned by prefixCTC_alloc
+ */
+void prefixCTC_free(std::uintptr_t inter_data_ptr) {
+  delete reinterpret_cast<InternalData*>(inter_data_ptr);
+}
+/**
+ * Run the full prefix beam search on the stream of `inter_data` and copy the
+ * results back to the host.
+ *
+ * The function resets clast/clen to 0 and clist to -1 (byte pattern 0xFF),
+ * executes max_select_seq_len steps of prefixCTC_V2, then copies the final
+ * clen, clist and score arrays (without their padding) to the caller's
+ * buffers and synchronizes the stream.
+ *
+ * @param inter_data state prepared by
+ *                   calculate_require_buff_and_init_internal_data
+ * @param pp         unused in this function
+ * @param blid       blank token id
+ * @param spid       space token id
+ * @param clist      [out] bs * beam rows of max_select_seq_len ints
+ * @param clen       [out] bs rows of beam ints
+ * @param score      [out] bs rows of beam floats
+ * @return always 0
+ */
+int ctc_beam_search_decoder_batch_gpu(
+    InternalData* inter_data,
+    float* pp,
+    int blid,
+    int spid,
+    int* clist,
+    int* clen,
+    float* score) {
+  // Reset the per-beam state. Each memset uses the size of the buffer it
+  // clears, so both clist buffers are filled with -1 in full.
+  CUDA_CHECK(cudaMemsetAsync(
+      (inter_data->clast.data_ptr()),
+      0,
+      inter_data->clast.size_in_byte(),
+      inter_data->stream));
+  CUDA_CHECK(cudaMemsetAsync(
+      (inter_data->clen[0].data_ptr()),
+      0,
+      inter_data->clen[0].size_in_byte(),
+      inter_data->stream));
+  CUDA_CHECK(cudaMemsetAsync(
+      (inter_data->clen[1].data_ptr()),
+      0,
+      inter_data->clen[1].size_in_byte(),
+      inter_data->stream));
+  CUDA_CHECK(cudaMemsetAsync(
+      (inter_data->clist[0].data_ptr()),
+      -1,
+      inter_data->clist[0].size_in_byte(),
+      inter_data->stream));
+  CUDA_CHECK(cudaMemsetAsync(
+      (inter_data->clist[1].data_ptr()),
+      -1,
+      inter_data->clist[1].size_in_byte(),
+      inter_data->stream));
+  // ptable:  probability table for prefixes ending in blank
+  //          (bs * beam * vocab_size)
+  // ptablen: probability table for prefixes not ending in blank
+  //          (bs * beam * vocab_size)
+  int step = 0;
+  while (step < inter_data->max_select_seq_len) {
+    bool is_last_step = (step == (inter_data->max_select_seq_len - 1));
+    prefixCTC_V2(
+        inter_data,
+        blid,
+        spid,
+        step,
+        is_last_step,
+        inter_data->max_select_seq_len);
+    step++;
+  }
+  // `step` is now one past the last executed step, so (step % 2) ^ 1 selects
+  // the buffer pair that step wrote. The 2D copies drop the row padding
+  // (ldbeam -> beam, ldseq_len -> max_select_seq_len).
+  CUDA_CHECK(cudaMemcpy2DAsync(
+      clen,
+      sizeof(int) * inter_data->beam,
+      inter_data->clen[(step % 2) ^ 1].data_ptr(),
+      sizeof(int) * inter_data->ldbeam,
+      sizeof(int) * inter_data->beam,
+      inter_data->bs,
+      cudaMemcpyDeviceToHost,
+      inter_data->stream));
+  CUDA_CHECK(cudaMemcpy2DAsync(
+      clist,
+      sizeof(int) * inter_data->max_select_seq_len,
+      inter_data->clist[(step % 2) ^ 1].data_ptr(),
+      sizeof(int) * inter_data->ldseq_len,
+      sizeof(int) * inter_data->max_select_seq_len,
+      inter_data->beam * inter_data->bs,
+      cudaMemcpyDeviceToHost,
+      inter_data->stream));
+  CUDA_CHECK(cudaMemcpy2DAsync(
+      score,
+      sizeof(float) * inter_data->beam,
+      inter_data->score.data_ptr(),
+      sizeof(float) * inter_data->ldbeam,
+      sizeof(float) * inter_data->beam,
+      inter_data->bs,
+      cudaMemcpyDeviceToHost,
+      inter_data->stream));
+  CUDA_CHECK(cudaStreamSynchronize(inter_data->stream));
+  return 0;
+}
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/ctc_prefix_decoder_kernel_v2.cu
+++ b/torchaudio/csrc/cuctc/src/ctc_prefix_decoder_kernel_v2.cu
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include <algorithm>
+#include "ctc_fast_divmod.cuh"
+#include "cub/cub.cuh"
+#include "device_data_wrap.h"
+#include "device_log_prob.cuh"
+#include "include/ctc_prefix_decoder_host.h"
+#include "bitonic_topk/warpsort_topk.cuh"
+namespace cu_ctc {
+// Constant factor of 1 that prob_space_blank_kernel_v2 multiplies into the
+// space-token score.
+__inline__ __device__ float _lauguage() {
+  constexpr float kUnitWeight = 1.0f;
+  return kUnitWeight;
+}
+// Combine two log-domain values by adding them.
+__inline__ __device__ float _logprob(float a, float b) {
+  const float sum = a + b;
+  return sum;
+}
+// log(exp(a) + exp(b)), evaluated as max(a, b) + log(1 + exp(-|a - b|)) with
+// the fast device intrinsics __logf / __expf.
+__inline__ __device__ float _logsumexp(float a, float b) {
+  float larger;
+  if (a > b) {
+    larger = a;
+  } else {
+    larger = b;
+  }
+  float neg_gap;
+  if ((a - b) > 0) {
+    neg_gap = b - a;
+  } else {
+    neg_gap = a - b;
+  }
+  return larger + __logf(1 + __expf(neg_gap));
+}
+// Return true when the first `len` elements of `a` and `b` are equal
+// (trivially true for len <= 0).
+__inline__ __device__ bool compare(int len, int* a, int* b) {
+  int pos = 0;
+  while (pos < len) {
+    if (a[pos] != b[pos]) {
+      return false;
+    }
+    ++pos;
+  }
+  return true;
+}
+// Feed `valid_count_this_block` candidates through a block-wide top-k
+// primitive in chunks, keeping the best entries in each thread's register
+// arrays.
+//
+//   keys / values          per-thread register slots; slot ITEM of thread tx
+//                          corresponds to position BLOCK_SIZE * ITEM + tx
+//   k                      number of entries carried from one pass to the next
+//   valid_count_this_block total number of candidates to consume
+//   default_key            key written to slots that have no candidate
+//   block_topk_fun         callable (keys, values, k, valid_count)
+//   set_key_value_fun      callable (key&, value&, idx) loading candidate idx
+//
+// NOTE(review): this presumably relies on block_topk_fun leaving the best k
+// entries at positions [0, k) of the layout above - confirm against the
+// callable that is passed in.
+template <
+    int BLOCK_SIZE,
+    int ITEMS_PER_THREAD,
+    typename KeyT,
+    typename ValueT,
+    typename BLOCK_TOPK_FUN,
+    typename SET_KEY_VALUE_FUN>
+__device__ __forceinline__ void block_topk_striped_wrap_with_default_key(
+    KeyT (&keys)[ITEMS_PER_THREAD],
+    ValueT (&values)[ITEMS_PER_THREAD],
+    const int k,
+    const int valid_count_this_block,
+    const KeyT default_key,
+    BLOCK_TOPK_FUN& block_topk_fun,
+    SET_KEY_VALUE_FUN& set_key_value_fun) {
+  const int tx = threadIdx.x;
+  // First pass: load candidates [0, BLOCK_SIZE * ITEMS_PER_THREAD); slots
+  // past the end only get the default key (their value is left untouched).
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    int idx = BLOCK_SIZE * ITEM + tx;
+    if (idx < valid_count_this_block) {
+      set_key_value_fun(keys[ITEM], values[ITEM], idx);
+    } else {
+      keys[ITEM] = default_key;
+    }
+  }
+  // Number of real candidates in the first pass, capped at the slot count.
+  const int valid_count_this_iter =
+      (valid_count_this_block < (BLOCK_SIZE * ITEMS_PER_THREAD))
+      ? valid_count_this_block
+      : (BLOCK_SIZE * ITEMS_PER_THREAD);
+  block_topk_fun(keys, values, k, valid_count_this_iter);
+  __syncthreads();
+  // Each later pass keeps positions [0, k) and refills the remaining
+  // `stride` positions with the next candidates.
+  const int stride = BLOCK_SIZE * ITEMS_PER_THREAD - k;
+  for (int idx_offset = ITEMS_PER_THREAD * BLOCK_SIZE;
+       idx_offset < valid_count_this_block;
+       idx_offset += stride) {
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+      // local_idx < 0 marks one of the first k positions, which is kept.
+      int local_idx = BLOCK_SIZE * ITEM + tx - k;
+      int target_idx = idx_offset + local_idx;
+      if (local_idx >= 0 && target_idx < valid_count_this_block) {
+        set_key_value_fun(keys[ITEM], values[ITEM], target_idx);
+      }
+      if (target_idx >= valid_count_this_block) {
+        keys[ITEM] = default_key;
+      }
+    }
+    // Full slot count for a complete chunk, otherwise the k carried entries
+    // plus the candidates that remain.
+    const int iter_valid_count =
+        ((valid_count_this_block - idx_offset) >= stride)
+        ? (BLOCK_SIZE * ITEMS_PER_THREAD)
+        : (k + valid_count_this_block - idx_offset);
+    block_topk_fun(keys, values, k, iter_valid_count);
+    __syncthreads();
+  }
+}
+/**
+ * Fill ptable / ptablen for every (beam, character) pair whose character is
+ * neither the blank nor the space token.
+ *
+ * Launch layout: blockIdx.y is the batch index; the threads of the x
+ * dimension stride together over the lc * beam pairs of that batch.
+ *
+ * For a pair (beamid, charid), with p = log_prob(batch, select_seq, charid)
+ * and prev = pprev[batch * ldbeam + beamid]:
+ *   - charid equals the beam's last character:
+ *       ptablen[charid] = p + prev.x and ptablen[blid] = p + prev.y
+ *   - otherwise:
+ *       ptablen[charid] = p + logsumexp(prev.x, prev.y)
+ *   - in both cases ptable[charid] = -FLT_MAX.
+ */
+__global__ void prob_matrix_v2_kernel(
+    LogProb log_prob_struct,
+    int step,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* clast,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int bs,
+    int blid,
+    int spid) {
+  const int batch_id = blockIdx.y;
+  // Bounds-check the batch index before any per-batch lookup, as the other
+  // kernels in this file do.
+  if (batch_id >= bs)
+    return;
+  if (!log_prob_struct.need_process_on_ith_step(batch_id, step))
+    return;
+  const int select_seq =
+      log_prob_struct.ith_selected_seq_in_this_batch(batch_id, step);
+  const int stride = blockDim.x * gridDim.x;
+  const int total_pairs = lc * beam;
+  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < total_pairs;
+       tid += stride) {
+    int beamid = tid / lc;
+    int charid = tid - beamid * lc;
+    if ((charid != blid) && charid != spid) {
+      int idout = charid + (beamid + batch_id * beam) * ldc;
+      int target_clast = clast[batch_id * ldbeam + beamid];
+      float cur_prob = log_prob_struct.at(batch_id, select_seq, charid);
+      float out_prob;
+      float2 beamid_p = pprev[batch_id * ldbeam + beamid];
+      if (target_clast == charid) {
+        // Repeated last character: the two components of prev are kept
+        // apart; the .y part goes to the blank column of ptablen.
+        out_prob = _logprob(cur_prob, beamid_p.x);
+        float out_prob_prefix = _logprob(cur_prob, beamid_p.y);
+        int idout_prefix = blid + (batch_id * beam + beamid) * ldc;
+        ptablen[idout_prefix] = out_prob_prefix;
+      } else {
+        out_prob = _logprob(cur_prob, _logsumexp(beamid_p.x, beamid_p.y));
+      }
+      ptable[idout] = -FLT_MAX;
+      ptablen[idout] = out_prob;
+    }
+  }
+}
+/**
+ * Fill the blank column, and the space column when a distinct space token
+ * exists, of ptable / ptablen for every beam.
+ *
+ * Launch layout: blockIdx.y is the batch index, threadIdx.x the beam index.
+ *
+ * Blank column (index blid):
+ *   ptable = log_prob(blid) + logsumexp(prev.x, prev.y); ptablen is set to
+ *   -FLT_MAX only when the beam's last character is the blank.
+ * Space column (index spid, only when spid >= 0 and spid != blid):
+ *   ptablen = _lauguage() * (log_prob(spid) + logsumexp(prev.x, prev.y));
+ *   ptable = -FLT_MAX.
+ */
+__global__ void prob_space_blank_kernel_v2(
+    LogProb log_prob_struct,
+    int step,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* clast,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int bs,
+    int blid,
+    int spid) {
+  const int batch_id = blockIdx.y;
+  if (!log_prob_struct.need_process_on_ith_step(batch_id, step))
+    return;
+  const int select_seq =
+      log_prob_struct.ith_selected_seq_in_this_batch(batch_id, step);
+  const int beamid = threadIdx.x;
+  // Threads beyond the beam width have nothing to write.
+  if (beamid >= beam)
+    return;
+
+  const int beam_slot = batch_id * ldbeam + beamid;
+  const int row_base = (batch_id * beam + beamid) * ldc;
+
+  // Blank token.
+  {
+    const float blank_logp = log_prob_struct.at(batch_id, select_seq, blid);
+    const float2 prev = pprev[beam_slot];
+    const int last_char = clast[beam_slot];
+    const int blank_out = blid + row_base;
+    ptable[blank_out] = _logprob(blank_logp, _logsumexp(prev.x, prev.y));
+    if (last_char == blid) {
+      ptablen[blank_out] = -FLT_MAX;
+    }
+  }
+
+  // Space token, when it exists and differs from the blank.
+  if (spid >= 0 && spid != blid) {
+    const float space_logp = log_prob_struct.at(batch_id, select_seq, spid);
+    const float2 prev = pprev[beam_slot];
+    const int space_out = spid + row_base;
+    ptablen[space_out] =
+        _lauguage() * _logprob(space_logp, _logsumexp(prev.x, prev.y));
+    ptable[space_out] = -FLT_MAX;
+  }
+}
+// Merge the scores of candidates that produce the same prefix.
+//
+// Launch layout: blockIdx.y is the batch index, blockIdx.x a "short" beam and
+// threadIdx.x a "long" beam. When beam threadIdx.x is exactly one character
+// longer than beam blockIdx.x and its first tmpclen[blockIdx.x] characters
+// match that beam, the entry for (beam blockIdx.x, last character of beam
+// threadIdx.x) is folded with logsumexp into the blank column of beam
+// threadIdx.x, and the source entry is reset to -FLT_MAX.
+//
+// NOTE(review): tmpclen holds 128 ints and is indexed by threadIdx.x < beam and
+// by blockIdx.x, so this presumably requires beam <= 128 and gridDim.x <= beam -
+// confirm at the launch site.
+__global__ void matrix_merge_kernel_v2(
+    LogProb log_prob_struct,
+    int step,
+    float* ptable,
+    float* ptablen,
+    int* ptid,
+    int* clast,
+    int* clist,
+    int* clen,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int bs,
+    int blid) {
+  // not produce the "l+ not in Aprev" part. If do this, need use ptable(n)@t-1
+  // this is a little kernel & latency dependency & almost no parallel
+  // each thread produce one beam .vs. one beam.
+  // block=beam,thread=beam (if beam<32, can use one block for optim)
+  const int batch_id = blockIdx.y;
+  if (!log_prob_struct.need_process_on_ith_step(batch_id, step))
+    return;
+  // select_seq is computed but not read in the rest of this kernel.
+  const int select_seq =
+      log_prob_struct.ith_selected_seq_in_this_batch(batch_id, step);
+  // Per-batch copy of the prefix lengths in shared memory.
+  __shared__ int tmpclen[128]; // beam<128
+  int tidin, tidout;
+  if (threadIdx.x < beam) {
+    tmpclen[threadIdx.x] = clen[threadIdx.x + blockIdx.y * ldbeam];
+  }
+  __syncthreads();
+  if (threadIdx.x < beam &&
+      ((tmpclen[threadIdx.x] - 1) ==
+       tmpclen[blockIdx.x])) { // char=blank && belong to the same beam @t-1; if
+                               // not meet, the whole block will not calculate.
+                               // delta(L)=1
+    // Compare the first tmpclen[blockIdx.x] characters of the two prefixes.
+    if (compare(
+            tmpclen[blockIdx.x],
+            clist + threadIdx.x * ldseq_len + blockIdx.y * ldseq_len * beam,
+            clist + blockIdx.x * ldseq_len + blockIdx.y * ldseq_len * beam)) {
+      // Source: row of beam blockIdx.x, column = last char of beam threadIdx.x.
+      tidin = clast[threadIdx.x + blockIdx.y * ldbeam] +
+          (blockIdx.x + blockIdx.y * beam) * ldc;
+      // Destination: blank column of beam threadIdx.x.
+      tidout = blid + (threadIdx.x + blockIdx.y * beam) * ldc;
+      ptable[tidout] = _logsumexp(ptable[tidout], ptable[tidin]);
+      ptablen[tidout] = _logsumexp(ptablen[tidout], ptablen[tidin]);
+      // The merged source entry must not be selected again.
+      ptable[tidin] = -FLT_MAX;
+      ptablen[tidin] = -FLT_MAX;
+    }
+  }
+}
+/**
+ * First decoding step: pick the `beam` best vocabulary entries of the first
+ * selected frame of each batch item and initialize the beam state from them.
+ *
+ * Launch layout: one block per batch item (blockIdx.x). Dynamic shared
+ * memory holds the sort queue's scratch, followed at byte offset
+ * `smem_result_byte_offset` by `beam` float keys and then `beam` int values.
+ *
+ * For each selected entry idx (vocabulary id `id`, log-probability `key`):
+ *   - id != blid: pprev = {key, -FLT_MAX} when a blank must be added for this
+ *     batch/step, else {-FLT_MAX, key}; `id` is appended to the beam's
+ *     character list, its length is incremented and clast is set to `id`.
+ *   - id == blid: pprev = {key, -FLT_MAX}; the character list is unchanged.
+ *   - score is set to `key` in both cases.
+ *
+ * NOTE(review): the results are read from shared memory by all threads right
+ * after queue.store(); confirm that store() synchronizes the block.
+ */
+template <int BLOCK_SIZE, int Capacity>
+__global__ __launch_bounds__(BLOCK_SIZE) void first_matrix__bitonic_topk_kernel(
+    LogProb log_prob_struct,
+    int step,
+    float2* pprev,
+    int* ptid,
+    int* clast,
+    int* clen,
+    int* clist,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int blid,
+    int bs,
+    float* score,
+    int smem_result_byte_offset) {
+  const int batch_id = blockIdx.x;
+  const int tx = threadIdx.x;
+  if (!log_prob_struct.need_process_on_ith_step(batch_id, step))
+    return;
+  const bool is_need_add_blank = log_prob_struct.need_add_blank(batch_id, step);
+  const int select_seq =
+      log_prob_struct.ith_selected_seq_in_this_batch(batch_id, step);
+  const int vocab_size = log_prob_struct.vocab_size;
+  extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
+  constexpr bool Ascending = false;
+  using namespace cu_ctc::topk;
+  block_sort<warp_sort_filtered, Capacity, Ascending, float, int> queue(
+      beam, smem_buf_bytes);
+  // The loop bound is extended by laneId(); ids past the vocabulary are fed
+  // to the queue as kDummy keys.
+  const int per_thread_lim = vocab_size + laneId();
+  for (int id = tx; id < per_thread_lim; id += BLOCK_SIZE) {
+    float key = (id < vocab_size)
+        ? (log_prob_struct.at(batch_id, select_seq, id))
+        : (warp_sort_filtered<Capacity, Ascending, float, int>::kDummy);
+    int value = id;
+    queue.add(key, value);
+  }
+  queue.done();
+  float* block_topk_key =
+      reinterpret_cast<float*>(smem_buf_bytes + smem_result_byte_offset);
+  // The values start right after the `beam` keys. block_topk_key is a float*,
+  // so the offset is counted in elements, not bytes.
+  int* block_topk_value = reinterpret_cast<int*>(block_topk_key + beam);
+  queue.store(block_topk_key, block_topk_value);
+  for (int idx = tx; idx < beam; idx += BLOCK_SIZE) {
+    int id = block_topk_value[idx];
+    float key = block_topk_key[idx];
+    // Current prefix length of this beam = write position of a new character.
+    int shift = clen[idx + batch_id * ldbeam];
+    if (id != blid) {
+      float2 xy =
+          is_need_add_blank ? float2{key, -FLT_MAX} : float2{-FLT_MAX, key};
+      pprev[batch_id * ldbeam + idx] = xy;
+      clist[batch_id * beam * ldseq_len + idx * ldseq_len + shift] = id;
+      clen[batch_id * ldbeam + idx] += 1;
+      clast[batch_id * ldbeam + idx] = id;
+    } else {
+      pprev[batch_id * ldbeam + idx] = float2{key, -FLT_MAX};
+    }
+    score[batch_id * ldbeam + idx] = key;
+  }
+}
+// First stage of the per-step top-k: several blocks per batch item each pick
+// their `beam` best candidates out of the ldc * beam entries of
+// ptable / ptablen.
+//
+// Launch layout: blockIdx.y is the batch index; the gridDim.x blocks of a
+// batch stride together over the candidate ids. A candidate's key is
+// logsumexp(ptable[id], ptablen[id]) and its value is the id itself. Block bx
+// writes its `beam` results at offset (batch_id * gridDim.x + bx) * beam of
+// topk_key_buffer / topk_value_buffer.
+template <int BLOCK_SIZE, int Capacity>
+__global__
+__launch_bounds__(BLOCK_SIZE) void bitonic_topk_multi_block_per_batch_kernel(
+    LogProb log_prob_struct,
+    int step,
+    const float* ptable,
+    const float* ptablen,
+    int lc,
+    int ldc,
+    int beam,
+    int bs,
+    float* topk_key_buffer,
+    int* topk_value_buffer,
+    FastDivmod ldc_fast_divmod) {
+  const int batch_id = blockIdx.y;
+  if (batch_id >= bs)
+    return;
+  if (!log_prob_struct.need_process_on_ith_step(batch_id, step))
+    return;
+  extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
+  constexpr bool Ascending = false;
+  using namespace cu_ctc::topk;
+  block_sort<warp_sort_filtered, Capacity, Ascending, float, int> queue(
+      beam, smem_buf_bytes);
+  const int bx = blockIdx.x;
+  const int blocks_per_batch = gridDim.x;
+  const int all_items_per_batch = ldc * beam;
+  const int stride = blocks_per_batch * BLOCK_SIZE;
+  const int gid = threadIdx.x + bx * BLOCK_SIZE;
+  const int block_out_offset = (batch_id * blocks_per_batch + bx) * beam;
+  // The loop bound is extended by laneId(); ids past the end keep the kDummy
+  // key.
+  const int per_thread_lim = all_items_per_batch + laneId();
+  for (int id = gid; id < per_thread_lim; id += stride) {
+    float key = warp_sort_filtered<Capacity, Ascending, float, int>::kDummy;
+    int value = id;
+    if (id < all_items_per_batch) {
+      int quotient;
+      int reminder;
+      // presumably reminder = id % ldc; the divisor is fixed by the caller
+      // that built ldc_fast_divmod - confirm at the launch site.
+      ldc_fast_divmod(quotient, reminder, id); // reminder = id%lc;
+      // Columns at or beyond lc keep the dummy key.
+      if (reminder < lc) {
+        int tidin = batch_id * all_items_per_batch + id;
+        float p = ptable[tidin];
+        float pn = ptablen[tidin];
+        key = _logsumexp(p, pn);
+      }
+    }
+    queue.add(key, value);
+  }
+  queue.done();
+  queue.store(
+      topk_key_buffer + block_out_offset, topk_value_buffer + block_out_offset);
+}
+// Second stage of the per-step top-k: reduce the `items_per_batch` candidates
+// of each batch item (read from topk_key_buffer / topk_value_buffer) to the
+// final `beam` entries and write the new beam state.
+//
+// Launch layout: one block per batch item (blockIdx.x).
+//
+// A candidate value `id` encodes beamid = id / ldc and charid = id % ldc. For
+// output beam slot idx:
+//   - clist2 row idx receives the prefix of source beam `beamid` from clist;
+//   - charid == blid: clast and clen2 are copied from the source beam;
+//     otherwise clast = charid, clen2 = source length + 1 and charid is
+//     appended to the clist2 row;
+//   - score = logsumexp(ptable[id], ptablen[id]);
+//   - pprev = {score, -FLT_MAX} when a blank must be added for this
+//     batch/step, else {ptable[id], ptablen[id]}.
+//
+// NOTE(review): clast is read at the source beam and written at the output
+// slot by different threads with no barrier in between - confirm this cannot
+// clobber a value another thread still has to read.
+template <int BLOCK_SIZE, int ITEMS_PER_THREAD, int WRITE_THREDS = 8>
+__global__
+__launch_bounds__(BLOCK_SIZE) void topk_reduce_and_copy_list_per_batch_kernel(
+    LogProb log_prob_struct,
+    int step,
+    int beam,
+    int items_per_batch,
+    int bs,
+    float* topk_key_buffer,
+    int* topk_value_buffer,
+    int ldc,
+    int ldbeam,
+    int ldseq_len,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* clast,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int blid,
+    float* score) {
+  // Only referenced by the USE_PARALLEL_WRITE variant below.
+  constexpr int MAX_SUPPORT_BEAM = 128;
+  int batch_id = blockIdx.x;
+  // Start of this batch item's candidates in the top-k buffers.
+  int rw_offset_this_block = batch_id * items_per_batch;
+  if (batch_id >= bs)
+    return;
+  if (!log_prob_struct.need_process_on_ith_step(batch_id, step))
+    return;
+  const bool is_need_add_blank = log_prob_struct.need_add_blank(batch_id, step);
+  const int tx = threadIdx.x;
+  using BlockRadixSortT =
+      cub::BlockRadixSort<float, BLOCK_SIZE, ITEMS_PER_THREAD, int>;
+  // Shared storage for the sort; the parallel-write variant adds a staging
+  // area to the same union.
+  // NOTE(review): the constexpr member declared inside this union looks like
+  // it would not compile when USE_PARALLEL_WRITE is defined - confirm.
+  __shared__ union {
+    typename BlockRadixSortT::TempStorage temp_storage;
+#ifdef USE_PARALLEL_WRITE
+    constexpr int smem_size = MAX_SUPPORT_BEAM * (sizeof(float) + sizeof(int));
+    uint8_t topk_key_value_smem[smem_size];
+#endif
+    /* data */
+  } ShareSmem;
+  float topk_keys[ITEMS_PER_THREAD];
+  int topk_values[ITEMS_PER_THREAD];
+  // Block-wide descending sort of all register slots; the `k` and
+  // `valid_count_this_iter` arguments are accepted but not used.
+  auto block_topk_fun = [&](float(&keys)[ITEMS_PER_THREAD],
+                            int(&values)[ITEMS_PER_THREAD],
+                            const int k,
+                            const int valid_count_this_iter) {
+    BlockRadixSortT{ShareSmem.temp_storage}.SortDescendingBlockedToStriped(
+        keys, values);
+  };
+  // Load candidate `idx` of this batch item from the top-k buffers.
+  auto set_key_value = [&](float& key, int& value, int idx) {
+    key = topk_key_buffer[idx + rw_offset_this_block];
+    value = topk_value_buffer[idx + rw_offset_this_block];
+  };
+  // Reduce all candidates; slots without a candidate get the lowest float as
+  // key.
+  block_topk_striped_wrap_with_default_key<
+      BLOCK_SIZE,
+      ITEMS_PER_THREAD,
+      float,
+      int>(
+      topk_keys,
+      topk_values,
+      beam,
+      items_per_batch,
+      cub::FpLimits<float>::Lowest(),
+      block_topk_fun,
+      set_key_value);
+  // write result in global memory
+  __syncthreads();
+#ifdef USE_PARALLEL_WRITE
+  // Variant 1: stage the selected keys/values in shared memory, then let
+  // groups of WRITE_THREDS threads copy one prefix each.
+  float* smem_keys =
+      reinterpret_cast<float*>(&(ShareSmem.topk_key_value_smem[0]));
+  int* smem_values = reinterpret_cast<int*>(
+      ShareSmem.topk_key_value_smem + MAX_SUPPORT_BEAM * sizeof(float));
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+    int idx = BLOCK_SIZE * ITEM + tx;
+    if (idx < beam) {
+      smem_keys[idx] = topk_keys[ITEM];
+      smem_values[idx] = topk_values[ITEM];
+    }
+  }
+  __syncthreads();
+  const int sub_warp_id = tx / WRITE_THREDS;
+  const int tid_in_subw = tx % WRITE_THREDS;
+  const int sub_warps = BLOCK_SIZE / WRITE_THREDS;
+  for (int out_beamid = sub_warp_id; out_beamid < beam;
+       out_beamid += sub_warps) {
+    int id = smem_values[out_beamid];
+    int beamid = id / ldc;
+    int charid = id - beamid * ldc; // id%ldc
+    int prevlen = clen[beamid + batch_id * ldbeam];
+    // PARALLEL_WRITE
+    for (int i = tid_in_subw; i < prevlen; i += WRITE_THREDS) {
+      clist2[batch_id * beam * ldseq_len + out_beamid * ldseq_len + i] =
+          clist[batch_id * beam * ldseq_len + beamid * ldseq_len + i];
+    }
+    // One thread per group updates the scalar state of the output beam.
+    if (tid_in_subw == 0) {
+      if (charid == blid) {
+        clast[batch_id * ldbeam + out_beamid] =
+            clast[beamid + batch_id * ldbeam];
+        clen2[batch_id * ldbeam + out_beamid] = prevlen;
+      } else {
+        clast[batch_id * ldbeam + out_beamid] = charid;
+        clen2[batch_id * ldbeam + out_beamid] = prevlen + 1;
+        clist2[batch_id * beam * ldseq_len + out_beamid * ldseq_len + prevlen] =
+            charid;
+      }
+      float2 ptable_ptablen;
+      ptable_ptablen.x = ptable[batch_id * ldc * beam + id];
+      ptable_ptablen.y = ptablen[batch_id * ldc * beam + id];
+      float cur_score = _logsumexp(ptable_ptablen.x, ptable_ptablen.y);
+      score[batch_id * ldbeam + out_beamid] = cur_score;
+      float2 ptable_ptablen2 = float2{cur_score, -FLT_MAX};
+      pprev[batch_id * ldbeam + out_beamid] =
+          is_need_add_blank ? ptable_ptablen2 : ptable_ptablen;
+    }
+  }
+#else
+  // Variant 2 (default): the thread whose register slot maps to position
+  // idx < beam writes output beam idx on its own.
+  {
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
+      int idx = BLOCK_SIZE * ITEM + tx;
+      if (idx < beam) {
+        int id = topk_values[ITEM];
+        int beamid = id / ldc;
+        int charid = id - beamid * ldc; // id%ldc
+        int prevlen = clen[beamid + batch_id * ldbeam];
+        int prevclast = clast[beamid + batch_id * ldbeam];
+        // Copy the source beam's prefix into the output row.
+        for (int i = 0; i < prevlen; i++) {
+          clist2[batch_id * beam * ldseq_len + idx * ldseq_len + i] =
+              clist[batch_id * beam * ldseq_len + beamid * ldseq_len + i];
+        }
+        if (charid == blid) {
+          // Blank: the prefix is unchanged.
+          clast[batch_id * ldbeam + idx] = prevclast;
+          clen2[batch_id * ldbeam + idx] = prevlen;
+        } else {
+          // Any other character is appended to the prefix.
+          clast[batch_id * ldbeam + idx] = charid;
+          clen2[batch_id * ldbeam + idx] = prevlen + 1;
+          clist2[batch_id * beam * ldseq_len + idx * ldseq_len + prevlen] =
+              charid;
+        }
+        float2 ptable_ptablen;
+        ptable_ptablen.x = ptable[batch_id * ldc * beam + id];
+        ptable_ptablen.y = ptablen[batch_id * ldc * beam + id];
+        float cur_score = _logsumexp(ptable_ptablen.x, ptable_ptablen.y);
+        score[batch_id * ldbeam + idx] = cur_score;
+        float2 ptable_ptablen2 = float2{cur_score, -FLT_MAX};
+        pprev[batch_id * ldbeam + idx] =
+            is_need_add_blank ? ptable_ptablen2 : ptable_ptablen;
+      }
+    }
+  }
+#endif
+}
+// Host-side launcher for the per-step probability tables. Enqueues two
+// kernels on `stream`, each receiving a by-value copy of *log_prob_struct and
+// the same remaining arguments:
+//   1. prob_matrix_v2_kernel      - 256 threads per block, grid (x, bs)
+//   2. prob_space_blank_kernel_v2 - ldbeam threads per block, grid (1, bs)
+// The launches are asynchronous and their status is not queried here.
+// Always returns 0.
+int CTC_prob_matrix_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* clast,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int bs,
+    int blid,
+    int spid,
+    cudaStream_t stream) {
+  dim3 grid, block;
+  block.x = 256, block.y = 1, block.z = 1;
+  // Cap grid.x so that grid.x * bs stays near MAX_BLOCKS, but never let the
+  // cap reach zero: MAX_BLOCKS / bs is 0 once bs > MAX_BLOCKS, and a grid
+  // dimension of 0 is an invalid launch configuration, so the kernel would
+  // silently not run. CTC_prob_topK_V2 applies the same std::max(1, ...) clamp.
+  // NOTE(review): assumes prob_matrix_v2_kernel copes with fewer blocks than
+  // work items, which is already required whenever the cap applies - confirm.
+  const int max_blocks_per_batch = std::max(1, MAX_BLOCKS / bs);
+  grid.x = min((lc * beam + block.x - 1) / block.x, max_blocks_per_batch);
+  grid.y = bs;
+  grid.z = 1;
+  prob_matrix_v2_kernel<<<grid, block, 0, stream>>>(
+      (*log_prob_struct),
+      step,
+      pprev,
+      ptable,
+      ptablen,
+      clast,
+      lc,
+      ldc,
+      beam,
+      ldbeam,
+      bs,
+      blid,
+      spid);
+  // Second kernel: one block per batch entry, one thread per ldbeam slot.
+  block.x = ldbeam, block.y = 1, block.z = 1;
+  grid.x = 1, grid.y = bs, grid.z = 1;
+  // ldbeam is used as the block size, hence the 1024 upper bound.
+  CHECK(ldbeam <= 1024, " only support  beam<=1024");
+  prob_space_blank_kernel_v2<<<grid, block, 0, stream>>>(
+      (*log_prob_struct),
+      step,
+      pprev,
+      ptable,
+      ptablen,
+      clast,
+      lc,
+      ldc,
+      beam,
+      ldbeam,
+      bs,
+      blid,
+      spid);
+  return 0;
+}
+// Launcher for the first decoding step. Selects the
+// first_matrix__bitonic_topk_kernel instantiation (capacity 8, 16, 32, 64 or
+// 128) that corresponds to topk::calc_capacity(beam) and starts it on `stream`
+// with one 256-thread block per batch entry. Always returns 0.
+int CTC_prob_first_step_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float2* pprev,
+    int* ptid,
+    int* clast,
+    int* clen,
+    int* clist,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int bs,
+    float* score,
+    cudaStream_t stream,
+    int blid) {
+  CHECK(beam <= 128, "ERROR: only support beam size <=128 ");
+  constexpr int kThreads = 256;
+  // Every instantiation has the same function type; 16 is only a placeholder
+  // used to name that type.
+  using KernelFn = decltype(first_matrix__bitonic_topk_kernel<kThreads, 16>);
+  // Entry k is the instantiation with capacity 2^(k + 3).
+  static KernelFn* kernel_table[5]{
+      first_matrix__bitonic_topk_kernel<kThreads, 8>,
+      first_matrix__bitonic_topk_kernel<kThreads, 16>,
+      first_matrix__bitonic_topk_kernel<kThreads, 32>,
+      first_matrix__bitonic_topk_kernel<kThreads, 64>,
+      first_matrix__bitonic_topk_kernel<kThreads, 128>};
+  // 31 - clz(x) is presumably floor(log2(x)) (clz looks like a
+  // count-leading-zeros helper - confirm); subtracting 3 maps capacity 8 to
+  // table index 0.
+  const int capacity_needed = topk::calc_capacity(beam);
+  const int kernel_idx = std::max(0, 31 - clz(capacity_needed) - 3);
+  const int capacity_used = (1 << (kernel_idx + 3));
+  const int subwarps = kThreads / std::min<int>(32, capacity_used);
+  // Dynamic shared memory: the rounded-up block-wide sort area followed by
+  // `beam` floats and `beam` ints.
+  const int sort_smem_bytes = cu_ctc::topk::roundUp256(
+      cu_ctc::topk::calc_smem_size_for_block_wide<float, int>(subwarps, beam));
+  const int total_smem_bytes =
+      sort_smem_bytes + beam * sizeof(float) + beam * sizeof(int);
+  kernel_table[kernel_idx]<<<bs, kThreads, total_smem_bytes, stream>>>(
+      (*log_prob_struct),
+      step,
+      pprev,
+      ptid,
+      clast,
+      clen,
+      clist,
+      beam,
+      ldbeam,
+      ldseq_len,
+      blid,
+      bs,
+      score,
+      sort_smem_bytes);
+  return 0;
+}
+// Launcher for matrix_merge_kernel_v2: a (beam, bs) grid of blocks with
+// ldbeam threads each, no dynamic shared memory, enqueued on `stream`.
+// Always returns 0.
+int CTC_prob_merge_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float* ptable,
+    float* ptablen,
+    int* ptid,
+    int* clast,
+    int* clist,
+    int* clen,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int bs,
+    cudaStream_t stream,
+    int blid) {
+  dim3 threads(ldbeam, 1, 1);
+  dim3 blocks(beam, bs, 1);
+  const int dynamic_smem_bytes = 0;
+  matrix_merge_kernel_v2<<<blocks, threads, dynamic_smem_bytes, stream>>>(
+      (*log_prob_struct),
+      step,
+      ptable,
+      ptablen,
+      ptid,
+      clast,
+      clist,
+      clen,
+      lc,
+      ldc,
+      beam,
+      ldbeam,
+      ldseq_len,
+      bs,
+      blid);
+  return 0;
+}
+// Host-side launcher for the two-stage top-k of one decoding step. Both
+// kernels are enqueued on `stream` and receive a by-value copy of
+// *log_prob_struct:
+//   1. bitonic_topk_multi_block_per_batch_kernel on a (bxs, bs) grid, given
+//      ptable/ptablen and the scratch buffers topk_key_buff/topk_value_buff.
+//   2. topk_reduce_and_copy_list_per_batch_kernel with one block per batch
+//      entry, given the same scratch buffers with items_per_batch = bxs * beam
+//      plus the beam state arrays (pprev, clast, clen/clen2, clist/clist2,
+//      score).
+// `ptid` and `is_last_step` are accepted but not referenced in this body.
+// Aborts via CHECK when beam > 128. Always returns 0.
+int CTC_prob_topK_V2(
+    LogProb* log_prob_struct,
+    int step,
+    float2* pprev,
+    float* ptable,
+    float* ptablen,
+    int* ptid,
+    int* clast,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int lc,
+    int ldc,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    int blid,
+    int bs,
+    float* score,
+    float* topk_key_buff,
+    int* topk_value_buff,
+    cudaStream_t stream,
+    bool is_last_step) {
+  CHECK(beam <= 128, "ERROR: only support beam size <=128 ");
+  // Number of candidates per batch entry handed to the first stage.
+  int all_items_per_batch = ldc * beam;
+  constexpr int items_per_thread0 = 4;
+  // #define USE_BLOCKS_PER_BATCH 4
+  // Defining USE_BLOCKS_PER_BATCH switches to a fixed number of blocks per
+  // batch entry with 256 threads each; the default path below derives the
+  // block count from MAX_BLOCKS and uses 128 threads.
+#ifdef USE_BLOCKS_PER_BATCH
+  constexpr int threads_per_block0 = 256;
+  constexpr int items_per_block_per_iter0 =
+      threads_per_block0 * items_per_thread0;
+  int bxs =
+      min(USE_BLOCKS_PER_BATCH,
+          (all_items_per_batch + items_per_block_per_iter0 - 1) /
+              items_per_block_per_iter0);
+  CHECK(
+      bxs * bs <= MAX_BLOCKS,
+      " ERROR: (batch_size * USE_BLOCKS_PER BATCH) should <=MAX_BLOCKS");
+#else
+  // At least 1 and at most MAX_BLOCKS_PER_BATCH blocks per batch entry.
+  int max_bxs_per_batch = std::max(1, MAX_BLOCKS / bs);
+  constexpr int MAX_BLOCKS_PER_BATCH = 16;
+  max_bxs_per_batch = std::min(MAX_BLOCKS_PER_BATCH, max_bxs_per_batch);
+  constexpr int threads_per_block0 = 128;
+  constexpr int items_per_block_per_iter0 =
+      threads_per_block0 * items_per_thread0;
+  // Ceil-divide the candidates over blocks, then apply the cap.
+  int bxs =
+      min(max_bxs_per_batch,
+          (all_items_per_batch + items_per_block_per_iter0 - 1) /
+              items_per_block_per_iter0);
+#endif
+  dim3 grid(bxs, bs);
+  dim3 block(threads_per_block0);
+  FastDivmod ldc_fast_div{ldc};
+  // Capacity is only used to name the kernel's function type below.
+  constexpr int Capacity = 32; // 8,16,32,64,128
+  using FunType = decltype(bitonic_topk_multi_block_per_batch_kernel<
+                           threads_per_block0,
+                           Capacity>);
+  // Entry k is the instantiation with capacity 2^(k + 3).
+  static FunType* BitonicTopkFuns[5]{
+      bitonic_topk_multi_block_per_batch_kernel<threads_per_block0, 8>,
+      bitonic_topk_multi_block_per_batch_kernel<threads_per_block0, 16>,
+      bitonic_topk_multi_block_per_batch_kernel<threads_per_block0, 32>,
+      bitonic_topk_multi_block_per_batch_kernel<threads_per_block0, 64>,
+      bitonic_topk_multi_block_per_batch_kernel<threads_per_block0, 128>};
+  int need_capacity = topk::calc_capacity(beam);
+  int fun_idx = 0;
+  // NOTE(review): 31 - clz(x) is presumably floor(log2(x)); the -3 maps
+  // capacity 8 to index 0 - confirm against clz's definition.
+  fun_idx = std::max(0, 31 - clz(need_capacity) - 3);
+  int actual_capacity = (1 << (fun_idx + 3));
+  int num_of_subwarp = threads_per_block0 / std::min<int>(32, actual_capacity);
+  int smem_size = cu_ctc::topk::calc_smem_size_for_block_wide<float, int>(
+      num_of_subwarp, beam);
+  // Stage 1: per-block top-k written to the scratch buffers.
+  BitonicTopkFuns[fun_idx]<<<grid, block, smem_size, stream>>>(
+      (*log_prob_struct),
+      step,
+      ptable,
+      ptablen,
+      lc,
+      ldc,
+      beam,
+      bs,
+      topk_key_buff,
+      topk_value_buff,
+      ldc_fast_div);
+  // Stage 2: one 128-thread block per batch entry consumes the bxs * beam
+  // scratch entries of that batch entry.
+  constexpr int threads_per_block1 = 128;
+  constexpr int items_per_thread1 = 2;
+  const int items_per_batch = bxs * beam;
+  topk_reduce_and_copy_list_per_batch_kernel<
+      threads_per_block1,
+      items_per_thread1><<<bs, threads_per_block1, 0, stream>>>(
+      (*log_prob_struct),
+      step,
+      beam,
+      items_per_batch,
+      bs,
+      topk_key_buff,
+      topk_value_buff,
+      ldc,
+      ldbeam,
+      ldseq_len,
+      pprev,
+      ptable,
+      ptablen,
+      clast,
+      clen,
+      clen2,
+      clist,
+      clist2,
+      blid,
+      score);
+  return 0;
+};
+// Compacts, per batch entry, the timestep indices whose log-probability at
+// vocabulary index `blid` is below `threshold`.
+//
+// One block handles one batch entry (batch_id = blockIdx.x); the BlockScan is
+// instantiated for BLOCK_SIZE threads. Timesteps are processed in tiles of
+// BLOCK_SIZE * ITEMS_PT, each thread owning ITEMS_PT consecutive timesteps.
+//
+// Outputs:
+//   log_prob_struct.select_seqs[batch_id * seq_len + k] - k-th selected
+//       timestep index
+//   log_prob_struct.select_seq_lens[batch_id]           - number selected
+template <int BLOCK_SIZE, int ITEMS_PT>
+__global__ void init_log_prob_select_kernel(
+    LogProb log_prob_struct,
+    int blid,
+    float threshold) {
+  // select seqs that log_prob[blid]< threshold
+  int batch_id = blockIdx.x;
+  using BlockScanT = cub::BlockScan<int, BLOCK_SIZE>;
+  __shared__ typename BlockScanT::TempStorage temp_storage;
+  // Per-thread 0/1 selection flags and their exclusive prefix sums.
+  int selected[ITEMS_PT];
+  int selected_scan[ITEMS_PT];
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PT; ITEM++) {
+    selected[ITEM] = 0;
+  }
+  const int tx = threadIdx.x;
+  int this_batch_seq_len = log_prob_struct.origin_seq_lens[batch_id];
+  // Running count of timesteps selected in earlier tiles; used as the write
+  // offset for the current tile.
+  int block_agg = 0;
+  for (int seq_id_offset = 0; seq_id_offset < this_batch_seq_len;
+       seq_id_offset += (BLOCK_SIZE * ITEMS_PT)) {
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PT; ITEM++) {
+      int seq_id = seq_id_offset + ITEMS_PT * tx + ITEM;
+      if (seq_id < this_batch_seq_len) {
+        selected[ITEM] =
+            (log_prob_struct.at(batch_id, seq_id, blid) < threshold) ? 1 : 0;
+      } else {
+        // Past the end of this batch entry's sequence: never selected.
+        selected[ITEM] = 0;
+      }
+    }
+    __syncthreads();
+    int block_agg_this_iter = 0;
+    // selected_scan[ITEM] becomes the number of selected items that precede
+    // this one within the tile; block_agg_this_iter the tile's total.
+    BlockScanT{temp_storage}.ExclusiveSum(
+        selected, selected_scan, block_agg_this_iter);
+    // temp_storage is reused by the next iteration's scan, so all threads
+    // must be done with it before continuing.
+    __syncthreads();
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PT; ITEM++) {
+      int seq_id = seq_id_offset + ITEMS_PT * tx + ITEM;
+      if (selected[ITEM]) {
+        log_prob_struct.select_seqs
+            [batch_id * log_prob_struct.seq_len + selected_scan[ITEM] +
+             block_agg] = seq_id;
+      }
+    }
+    block_agg += block_agg_this_iter;
+  }
+  // Every thread holds the same total; a single thread publishes it.
+  if (tx == 0) {
+    log_prob_struct.select_seq_lens[batch_id] = block_agg;
+  }
+}
+// Runs init_log_prob_select_kernel (one 128-thread block per batch entry) on
+// `stream`, copies log_prob_struct->select_seq_lens back to the host and
+// returns the largest value.
+//
+// This call blocks: it synchronizes `stream` before reading the lengths.
+// Returns 0 when the batch is empty.
+int init_log_prob_and_cal_max_select_seq_len(
+    LogProb* log_prob_struct,
+    int blid,
+    float threshold,
+    cudaStream_t stream) {
+  constexpr int BLOCK_SIZE = 128;
+  constexpr int ITEMS_PT = 4;
+  const int bxs = log_prob_struct->batch;
+  // An empty batch would mean a zero-block launch (an invalid configuration)
+  // and dereferencing std::max_element on an empty range (undefined
+  // behaviour); there is nothing to select, so report length 0.
+  if (bxs <= 0) {
+    return 0;
+  }
+  init_log_prob_select_kernel<BLOCK_SIZE, ITEMS_PT>
+      <<<bxs, BLOCK_SIZE, 0, stream>>>((*log_prob_struct), blid, threshold);
+  // for simplicity, find max_select_seq_len on the CPU
+  std::vector<int> select_seq_lens(bxs);
+  CUDA_CHECK(cudaMemcpyAsync(
+      select_seq_lens.data(),
+      log_prob_struct->select_seq_lens,
+      sizeof(int) * bxs,
+      cudaMemcpyDeviceToHost,
+      stream));
+  // The copy is asynchronous; wait for it before touching the host buffer.
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  return *std::max_element(select_seq_lens.begin(), select_seq_lens.end());
+}
+// For every batch entry whose select_seq_len has a different parity than
+// max_select_seq_len, copies that entry's clen/clist into clen2/clist2.
+// Batch entries with matching parity are left untouched.
+//
+// blockIdx.y selects the batch entry. Within a block, each run of
+// SUB_WARP_SIZE consecutive threads copies one beam, so a block covers
+// BLOCK_SIZE / SUB_WARP_SIZE beams. `step` is not used.
+template <int SUB_WARP_SIZE, int BLOCK_SIZE>
+__global__ void copy_list_len_for_diff_parity_kernel(
+    LogProb log_prob_struct,
+    int step,
+    int max_select_seq_len,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int bs,
+    int beam,
+    int ldbeam,
+    int ldseq_len) {
+  const int batch_id = blockIdx.y;
+  if (batch_id >= bs) {
+    return;
+  }
+  // Equal parity <=> the lowest bit of the XOR is clear: nothing to copy.
+  const int own_select_len = log_prob_struct.select_seq_lens[batch_id];
+  if (((own_select_len ^ max_select_seq_len) & 1) == 0) {
+    return;
+  }
+  constexpr int kBeamsPerBlock = BLOCK_SIZE / SUB_WARP_SIZE;
+  const int tid = threadIdx.x;
+  const int group = tid / SUB_WARP_SIZE;
+  const int lane = tid % SUB_WARP_SIZE;
+  const int block_x = blockIdx.x;
+  const int beamid = block_x * kBeamsPerBlock + group;
+  if (beamid >= beam) {
+    return;
+  }
+  const int len_slot = batch_id * ldbeam + beamid;
+  const int list_base = batch_id * beam * ldseq_len + beamid * ldseq_len;
+  const int len = clen[len_slot];
+  // One lane per group writes the length; all lanes share the list copy.
+  if (lane == 0) {
+    clen2[len_slot] = len;
+  }
+  for (int pos = lane; pos < len; pos += SUB_WARP_SIZE) {
+    clist2[list_base + pos] = clist[list_base + pos];
+  }
+}
+// Simpler variant of the parity copy: blockIdx.x selects the batch entry and
+// each thread copies whole beams (length and list), striding over the beams
+// by blockDim.x. Batch entries whose select_seq_len parity equals that of
+// max_select_seq_len are skipped. `step` is not used.
+__global__ void copy_list_len_for_diff_parity_simple_kernel(
+    LogProb log_prob_struct,
+    int step,
+    int max_select_seq_len,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int bs,
+    int beam,
+    int ldbeam,
+    int ldseq_len) {
+  const int batch_id = blockIdx.x;
+  if (batch_id >= bs) {
+    return;
+  }
+  // Equal parity <=> the lowest bit of the XOR is clear: nothing to copy.
+  const int own_select_len = log_prob_struct.select_seq_lens[batch_id];
+  if (((own_select_len ^ max_select_seq_len) & 1) == 0) {
+    return;
+  }
+  const int first_beam = threadIdx.x;
+  for (int beamid = first_beam; beamid < beam; beamid += blockDim.x) {
+    const int len_slot = batch_id * ldbeam + beamid;
+    const int list_base = batch_id * beam * ldseq_len + beamid * ldseq_len;
+    const int len = clen[len_slot];
+    clen2[len_slot] = len;
+    for (int pos = 0; pos < len; ++pos) {
+      clist2[list_base + pos] = clist[list_base + pos];
+    }
+  }
+}
+// Launcher for copy_list_len_for_diff_parity_kernel<8, 256> on `stream`.
+// The grid is (ceil(beam / 32), bs): 256 threads per block split into groups
+// of 8 gives 32 beams per block. Always returns 0.
+int CTC_copy_list_len_for_differnet_parity(
+    LogProb* log_prob_struct,
+    int step,
+    int max_select_seq_len,
+    int* clen,
+    int* clen2,
+    int* clist,
+    int* clist2,
+    int bs,
+    int beam,
+    int ldbeam,
+    int ldseq_len,
+    cudaStream_t stream) {
+  constexpr int kGroupSize = 8;
+  constexpr int kThreads = 256;
+  constexpr int kBeamsPerBlock = kThreads / kGroupSize;
+  // Ceil-divide so a partially filled last block still covers its beams.
+  const int blocks_x = (beam + kBeamsPerBlock - 1) / kBeamsPerBlock;
+  dim3 launch_grid(blocks_x, bs, 1);
+  copy_list_len_for_diff_parity_kernel<kGroupSize, kThreads>
+      <<<launch_grid, kThreads, 0, stream>>>(
+          (*log_prob_struct),
+          step,
+          max_select_seq_len,
+          clen,
+          clen2,
+          clist,
+          clist2,
+          bs,
+          beam,
+          ldbeam,
+          ldseq_len);
+  return 0;
+}
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/device_data_wrap.h
+++ b/torchaudio/csrc/cuctc/src/device_data_wrap.h
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+#include <iostream>
+#include <vector>
+#include "include/ctc_prefix_decoder_host.h"
+namespace cu_ctc {
+constexpr size_t ALIGN_BYTES = 128;
+constexpr int MAX_BLOCKS = 800;
+// Non-owning view of a device buffer: a raw pointer plus the buffer size in
+// bytes. This class neither allocates nor frees the memory it points to.
+template <typename T>
+class DeviceDataWrap {
+ public:
+  // Empty view: null pointer, zero bytes.
+  DeviceDataWrap() : data_{}, size_in_bytes_{} {}
+  // View over `size_in_byte` bytes starting at `data_ptr`.
+  DeviceDataWrap(T* data_ptr, size_t size_in_byte)
+      : data_{data_ptr}, size_in_bytes_{size_in_byte} {}
+  // Debug helper: copies `size_in_element` elements starting at element
+  // `offset` from the device to the host (blocking cudaMemcpy) and prints
+  // them to std::cout as "[index]:value", breaking the line after every
+  // `eles_per_row` elements. Aborts when the requested range exceeds the
+  // wrapped buffer. A row width of 0 disables line breaking.
+  void print(size_t offset, size_t size_in_element, int eles_per_row = 10)
+      const {
+    if ((offset + size_in_element) * sizeof(T) > size_in_bytes_) {
+      std::cerr
+          << " ERROR: in DeviceDataWrap print : offset+size_in_element > size_in_bytes_";
+      abort();
+    }
+    std::vector<T> host_data(size_in_element);
+    CUDA_CHECK(cudaMemcpy(
+        host_data.data(),
+        data_ + offset,
+        size_in_element * sizeof(T),
+        cudaMemcpyDeviceToHost));
+    // `i % 0` is undefined behaviour, so a zero row width means "never wrap".
+    const bool wrap_rows = (eles_per_row != 0);
+    // size_t index: the bound is a size_t, and an int index would be compared
+    // signed-vs-unsigned and overflow for very large element counts.
+    for (size_t i = 0; i < size_in_element; ++i) {
+      // The modulo is done in a signed type so that a negative row width
+      // keeps behaving exactly as it did with the former int index.
+      if (wrap_rows && i != 0 &&
+          (static_cast<long long>(i) % eles_per_row == 0)) {
+        std::cout << " \n";
+      }
+      std::cout << "[" << i << "]:" << host_data[i] << " ";
+    }
+    std::cout << "\n";
+  }
+  // Implicit conversions to the wrapped raw pointer.
+  operator T*() {
+    return data_;
+  }
+  operator const T*() {
+    return const_cast<const T*>(data_);
+  }
+  T* data_ptr() const {
+    return data_;
+  }
+  size_t size_in_byte() const {
+    return size_in_bytes_;
+  }
+  // The setters only rebind the view; the previous buffer is not released.
+  void set_data_ptr(T* data_ptr) {
+    data_ = data_ptr;
+  }
+  void set_size_in_byte(size_t size_in_byte) {
+    size_in_bytes_ = size_in_byte;
+  }
+ private:
+  T* data_;
+  size_t size_in_bytes_;
+};
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/device_log_prob.cuh
+++ b/torchaudio/csrc/cuctc/src/device_log_prob.cuh
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+namespace cu_ctc {
+// Strided view over a log-probability buffer plus the per-batch timestep
+// selection arrays. Member order and types are kept as-is because the struct
+// is handed around as a plain value.
+struct LogProb {
+  float* data_ptr;
+  int batch;
+  int seq_len;
+  int vocab_size;
+  // Element strides used by at() for the batch, timestep and vocabulary axes.
+  int batch_stride;
+  int seq_len_stride;
+  int vocab_stride;
+  int* origin_seq_lens; // one entry per batch
+  int* select_seqs; // indexed as [batch_id * seq_len + i]
+  int* select_seq_lens; // one entry per batch
+  // Reads the element addressed by (batch_id, seq_id, char_id) through the
+  // three strides.
+  __device__ __forceinline__ float at(int batch_id, int seq_id, int char_id) {
+    const int flat_index = batch_id * batch_stride + seq_id * seq_len_stride +
+        char_id * vocab_stride;
+    return data_ptr[flat_index];
+  }
+  // Returns the i-th entry of select_seqs for `batch_id`.
+  __device__ __forceinline__ int ith_selected_seq_in_this_batch(
+      int batch_id,
+      int i) {
+    const int* batch_row = select_seqs + batch_id * seq_len;
+    return batch_row[i];
+  }
+  // True while `istep` is below the number of selected timesteps of
+  // `batch_id`.
+  __device__ __forceinline__ bool need_process_on_ith_step(
+      int batch_id,
+      int istep) {
+    const int selected_count = select_seq_lens[batch_id];
+    return istep < selected_count;
+  }
+  /**
+   * @brief Tells whether original timesteps were skipped between selected
+   * step `istep` and selected step `istep + 1`.
+   *
+   * If the blank probability of the next original timestep exceeded the
+   * threshold, that timestep is not processed on its own; the skipped blank
+   * is handled on the currently processed timestep instead.
+   *
+   * @param batch_id batch entry to inspect
+   * @param istep index into the selected timesteps of that batch entry
+   * @return true when both `istep` and `istep + 1` are valid selected steps
+   * and their original timestep indices differ by more than one; false
+   * otherwise
+   */
+  __device__ __forceinline__ bool need_add_blank(int batch_id, int istep) {
+    if (istep < 0) {
+      return false;
+    }
+    const int next_step = istep + 1;
+    if (next_step >= select_seq_lens[batch_id]) {
+      return false;
+    }
+    const int next_origin = ith_selected_seq_in_this_batch(batch_id, next_step);
+    const int this_origin = ith_selected_seq_in_this_batch(batch_id, istep);
+    return (next_origin - this_origin) > 1;
+  }
+};
+} // namespace cu_ctc
--- a/torchaudio/csrc/cuctc/src/python_binding.cpp
+++ b/torchaudio/csrc/cuctc/src/python_binding.cpp
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+#include "include/ctc_prefix_decoder.h"
+namespace py = pybind11;
+// Python-facing wrapper around the batched CTC prefix beam search.
+//
+// Pointers arrive from Python as integers (n_inter_data, buff_ptr, pp,
+// seq_len_ptr) and are cast back here. The call first asks
+// cu_ctc::calculate_require_buff_and_init_internal_data how much buffer is
+// needed:
+//   * if it reports require_size > 0, nothing is decoded and
+//     (require_size, {}) is returned;
+//   * otherwise the decoder runs and the result is (require_size, hyps),
+//     where hyps[b][k] is the pair (score, token list) of beam k of batch
+//     entry b.
+//
+// Throws std::invalid_argument when pp_sizes has fewer than 3 elements.
+std::tuple<size_t, std::vector<std::vector<std::pair<float, std::vector<int>>>>>
+ctc_prefix_decoder_batch_wrapper(
+    std::uintptr_t n_inter_data,
+    std::uintptr_t buff_ptr,
+    size_t buff_size,
+    std::uintptr_t pp,
+    std::uintptr_t seq_len_ptr,
+    const std::vector<int>& pp_sizes,
+    const std::vector<int>& pp_strides,
+    int beam,
+    int blid,
+    int spid,
+    float thresold) {
+  using SCORE_TYPE =
+      std::vector<std::vector<std::pair<float, std::vector<int>>>>;
+  // pp_sizes comes straight from Python; indexing it below without this check
+  // would read past the end of a too-short list.
+  if (pp_sizes.size() < 3) {
+    throw std::invalid_argument(
+        "ctc_prefix_decoder_batch_wrapper: pp_sizes must have at least 3 elements");
+  }
+  cu_ctc::InternalData* inter_data = (cu_ctc::InternalData*)(n_inter_data);
+  auto [require_size, max_select_seq_len] =
+      cu_ctc::calculate_require_buff_and_init_internal_data(
+          inter_data,
+          pp_sizes[0],
+          pp_sizes[1],
+          pp_sizes[2],
+          beam,
+          buff_ptr,
+          buff_size,
+          (float*)pp,
+          (int*)seq_len_ptr,
+          pp_sizes,
+          pp_strides,
+          blid,
+          thresold);
+  if (require_size > 0) {
+    return std::make_tuple(require_size, SCORE_TYPE{});
+  }
+  // Sizes and offsets are computed in size_t: the product
+  // batch * beam * max_select_seq_len can exceed the range of int.
+  const size_t batch_count = static_cast<size_t>(pp_sizes[0]);
+  const size_t beam_count = static_cast<size_t>(beam);
+  const size_t seq_stride = static_cast<size_t>(max_select_seq_len);
+  std::vector<int> list_data(batch_count * beam_count * seq_stride);
+  std::vector<int> len_data(batch_count * beam_count);
+  std::vector<float> score(batch_count * beam_count);
+  cu_ctc::ctc_beam_search_decoder_batch_gpu(
+      inter_data,
+      (float*)pp,
+      blid,
+      spid,
+      list_data.data(),
+      len_data.data(),
+      score.data());
+  SCORE_TYPE score_hyps{};
+  score_hyps.reserve(batch_count);
+  for (size_t b = 0; b < batch_count; ++b) {
+    score_hyps.emplace_back();
+    auto& hyps = score_hyps.back();
+    hyps.reserve(beam_count);
+    for (size_t beam_id = 0; beam_id < beam_count; ++beam_id) {
+      // Each (batch, beam) slot owns seq_stride ints in list_data, of which
+      // only the first len_data[slot] are part of the hypothesis.
+      const size_t slot = b * beam_count + beam_id;
+      const int len = len_data[slot];
+      const int* first = list_data.data() + slot * seq_stride;
+      std::vector<int> clist(first, first + len);
+      hyps.emplace_back(score[slot], std::move(clist));
+    }
+  }
+  return std::make_tuple(require_size, std::move(score_hyps));
+}
+// Defines the Python module `pybind11_prefixctc` with three functions:
+//   ctc_beam_search_decoder_batch_gpu_v2 -> ctc_prefix_decoder_batch_wrapper
+//   prefixCTC_alloc                      -> cu_ctc::prefixCTC_alloc
+//   prefixCTC_free                       -> cu_ctc::prefixCTC_free
+PYBIND11_MODULE(pybind11_prefixctc, m) {
+  // NOTE(review): the module docstring is the literal text "none".
+  m.doc() = "none";
+  m.def(
+      "ctc_beam_search_decoder_batch_gpu_v2",
+      &ctc_prefix_decoder_batch_wrapper,
+      "ctc prefix decoder  v2 computing on GPU");
+  m.def("prefixCTC_alloc", &cu_ctc::prefixCTC_alloc, "allocate internal data");
+  m.def("prefixCTC_free", &cu_ctc::prefixCTC_free, "free internal data");
+}
--- a/torchaudio/csrc/ffmpeg/CMakeLists.txt
+++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt
+# Sources passed to every torchaudio_library(libtorchaudio_ffmpeg* ...) call
+# below. Paths are relative to this directory.
+set(
+  sources
+  ffmpeg.cpp
+  filter_graph.cpp
+  hw_context.cpp
+  stream_reader/buffer/chunked_buffer.cpp
+  stream_reader/buffer/unchunked_buffer.cpp
+  stream_reader/conversion.cpp
+  stream_reader/packet_buffer.cpp
+  stream_reader/post_process.cpp
+  stream_reader/stream_processor.cpp
+  stream_reader/stream_reader.cpp
+  stream_writer/encode_process.cpp
+  stream_writer/encoder.cpp
+  stream_writer/packet_writer.cpp
+  stream_writer/stream_writer.cpp
+  stream_writer/tensor_converter.cpp
+  compat.cpp
+  )
+# Sources passed to every torchaudio_extension(_torchaudio_ffmpeg* ...) call
+# below, i.e. only used when BUILD_TORCHAUDIO_PYTHON_EXTENSION is enabled.
+set(
+  ext_sources
+  pybind/pybind.cpp
+  )
+# Extra link dependency appended to the link list of every library below.
+# Start from an explicit empty value so that a variable of the same name
+# inherited from an enclosing directory scope cannot leak into the link list
+# when USE_CUDA is off.
+set(additional_lib "")
+if(USE_CUDA)
+  set(additional_lib cuda_deps)
+endif()
+# Library and extension targets.
+#   * A target named `ffmpeg` exists: build a single libtorchaudio_ffmpeg
+#     linked against it (plus the matching _torchaudio_ffmpeg extension).
+#   * Otherwise: build one library per ffmpeg4 / ffmpeg5 / ffmpeg6 link
+#     target (plus one extension per library).
+# Extensions are only defined when BUILD_TORCHAUDIO_PYTHON_EXTENSION is set.
+if(TARGET ffmpeg)
+  torchaudio_library(
+    libtorchaudio_ffmpeg
+    "${sources}"
+    ""
+    "torch;ffmpeg;${additional_lib}"
+    ""
+    )
+  if(BUILD_TORCHAUDIO_PYTHON_EXTENSION)
+    torchaudio_extension(
+      _torchaudio_ffmpeg
+      "${ext_sources}"
+      ""
+      "libtorchaudio_ffmpeg"
+      "TORCHAUDIO_FFMPEG_EXT_NAME=_torchaudio_ffmpeg"
+      )
+  endif()
+else()
+  # All libraries are declared before any extension, so the definition order
+  # matches the hand-unrolled form: libs 4, 5, 6, then extensions 4, 5, 6.
+  foreach(ffmpeg_version IN ITEMS 4 5 6)
+    torchaudio_library(
+      libtorchaudio_ffmpeg${ffmpeg_version}
+      "${sources}"
+      ""
+      "torch;ffmpeg${ffmpeg_version};${additional_lib}"
+      ""
+      )
+  endforeach()
+  if(BUILD_TORCHAUDIO_PYTHON_EXTENSION)
+    foreach(ffmpeg_version IN ITEMS 4 5 6)
+      torchaudio_extension(
+        _torchaudio_ffmpeg${ffmpeg_version}
+        "${ext_sources}"
+        ""
+        "libtorchaudio_ffmpeg${ffmpeg_version}"
+        "TORCHAUDIO_FFMPEG_EXT_NAME=_torchaudio_ffmpeg${ffmpeg_version}"
+        )
+    endforeach()
+  endif()
+endif()
--- a/torchaudio/csrc/ffmpeg/compat.cpp
+++ b/torchaudio/csrc/ffmpeg/compat.cpp
+#include <torch/script.h>
+#include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
+#include <stdexcept>
+namespace torchaudio {
+namespace io {
+namespace {
+// Registers audio stream `i` of `s` as an output stream (with the optional
+// `filter`), decodes every packet, and returns the frames of the first popped
+// chunk. When `channels_first` is true, dims 0 and 1 of the result are
+// swapped. Fails with a TORCH_CHECK error if no chunk was produced.
+torch::Tensor _load_audio(
+    StreamReader& s,
+    int i,
+    const c10::optional<std::string>& filter,
+    const bool& channels_first) {
+  s.add_audio_stream(i, -1, -1, filter, {}, {});
+  s.process_all_packets();
+  auto decoded = s.pop_chunks()[0];
+  TORCH_CHECK(decoded, "Failed to decode audio.");
+  auto frames = decoded.value().frames;
+  if (channels_first) {
+    return frames.transpose(0, 1);
+  }
+  return frames;
+}
+// Opens `src` (optionally forcing `format`), decodes its best audio stream in
+// full, and returns (waveform, sample rate). `filter` and `channels_first`
+// are forwarded to _load_audio.
+std::tuple<torch::Tensor, int64_t> load(
+    const std::string& src,
+    const c10::optional<std::string>& format,
+    const c10::optional<std::string>& filter,
+    const bool& channels_first) {
+  StreamReader reader{src, format, {}};
+  auto stream_index = reader.find_best_audio_stream();
+  // Read the sample rate from the stream info before decoding starts.
+  auto rate = reader.get_src_stream_info(stream_index).sample_rate;
+  auto audio = _load_audio(reader, stream_index, filter, channels_first);
+  return {audio, rate};
+}
+// Opens `src` (optionally forcing `format`) and returns, for its best audio
+// stream: (sample rate, number of frames, number of channels, bits per
+// sample, codec name). When the stream info reports 0 frames, the stream is
+// decoded in full and the frame count is taken from the decoded tensor.
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> info(
+    const std::string& src,
+    const c10::optional<std::string>& format) {
+  StreamReader reader{src, format, {}};
+  auto stream_index = reader.find_best_audio_stream();
+  auto stream_info = reader.get_src_stream_info(stream_index);
+  int64_t frame_count = stream_info.num_frames;
+  if (stream_info.num_frames == 0) {
+    // Frame count missing from the stream info: decode and count instead.
+    torch::Tensor decoded = _load_audio(reader, stream_index, {}, false);
+    frame_count = decoded.size(0);
+  }
+  return {
+      static_cast<int64_t>(stream_info.sample_rate),
+      static_cast<int64_t>(frame_count),
+      static_cast<int64_t>(stream_info.num_channels),
+      static_cast<int64_t>(stream_info.bits_per_sample),
+      stream_info.codec_name};
+}
+// Registers load() and info() from this file in the `torchaudio` operator
+// library under the names torchaudio::compat_load and torchaudio::compat_info.
+TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
+  m.def("torchaudio::compat_load", &load);
+  m.def("torchaudio::compat_info", &info);
+}
+} // namespace
+} // namespace io
+} // namespace torchaudio
--- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -5,8 +5,7 @@
 #include <string>
 #include <vector>
-namespace torchaudio {
+namespace torchaudio::io {
-namespace ffmpeg {
 ////////////////////////////////////////////////////////////////////////////////
 // AVDictionary
@@ -14,8 +13,8 @@ namespace ffmpeg {
 AVDictionary* get_option_dict(const c10::optional<OptionDict>& option) {
  AVDictionary* opt = nullptr;
  if (option) {
-    for (const auto& it : option.value()) {
+    for (auto const& [key, value] : option.value()) {
-      av_dict_set(&opt, it.key().c_str(), it.value().c_str(), 0);
+      av_dict_set(&opt, key.c_str(), value.c_str(), 0);
    }
  }
  return opt;
@@ -73,16 +72,13 @@ void AVPacketDeleter::operator()(AVPacket* p) {
  av_packet_free(&p);
 };
-namespace {
+AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper<AVPacket, AVPacketDeleter>(p) {}
-AVPacket* get_av_packet() {
-  AVPacket* pPacket = av_packet_alloc();
-  TORCH_CHECK(pPacket, "Failed to allocate AVPacket object.");
-  return pPacket;
-}
-} // namespace
-AVPacketPtr::AVPacketPtr()
+AVPacketPtr alloc_avpacket() {
-    : Wrapper<AVPacket, AVPacketDeleter>(get_av_packet()) {}
+  AVPacket* p = av_packet_alloc();
+  TORCH_CHECK(p, "Failed to allocate AVPacket object.");
+  return AVPacketPtr{p};
+}
 ////////////////////////////////////////////////////////////////////////////////
 // AVPacket - buffer unref
@@ -101,15 +97,14 @@ AutoPacketUnref::operator AVPacket*() const {
 void AVFrameDeleter::operator()(AVFrame* p) {
  av_frame_free(&p);
 };
-namespace {
-AVFrame* get_av_frame() {
-  AVFrame* pFrame = av_frame_alloc();
-  TORCH_CHECK(pFrame, "Failed to allocate AVFrame object.");
-  return pFrame;
-}
-} // namespace
-AVFramePtr::AVFramePtr() : Wrapper<AVFrame, AVFrameDeleter>(get_av_frame()) {}
+AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper<AVFrame, AVFrameDeleter>(p) {}
+AVFramePtr alloc_avframe() {
+  AVFrame* p = av_frame_alloc();
+  TORCH_CHECK(p, "Failed to allocate AVFrame object.");
+  return AVFramePtr{p};
+};
 ////////////////////////////////////////////////////////////////////////////////
 // AVCodecContext
@@ -128,15 +123,8 @@ void AutoBufferUnref::operator()(AVBufferRef* p) {
  av_buffer_unref(&p);
 }
-AVBufferRefPtr::AVBufferRefPtr()
+AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
-    : Wrapper<AVBufferRef, AutoBufferUnref>(nullptr) {}
+    : Wrapper<AVBufferRef, AutoBufferUnref>(p) {}
-void AVBufferRefPtr::reset(AVBufferRef* p) {
-  TORCH_CHECK(
-      !ptr,
-      "InternalError: A valid AVBufferRefPtr is being reset. Please file an issue.");
-  ptr.reset(p);
-}
 ////////////////////////////////////////////////////////////////////////////////
 // AVFilterGraph
@@ -145,18 +133,17 @@ void AVFilterGraphDeleter::operator()(AVFilterGraph* p) {
  avfilter_graph_free(&p);
 };
-namespace {
+AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
-AVFilterGraph* get_filter_graph() {
+    : Wrapper<AVFilterGraph, AVFilterGraphDeleter>(p) {}
-  AVFilterGraph* ptr = avfilter_graph_alloc();
-  TORCH_CHECK(ptr, "Failed to allocate resouce.");
-  return ptr;
-}
-} // namespace
-AVFilterGraphPtr::AVFilterGraphPtr()
-    : Wrapper<AVFilterGraph, AVFilterGraphDeleter>(get_filter_graph()) {}
-void AVFilterGraphPtr::reset() {
+////////////////////////////////////////////////////////////////////////////////
-  ptr.reset(get_filter_graph());
+// AVCodecParameters
+////////////////////////////////////////////////////////////////////////////////
+void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) {
+  avcodec_parameters_free(&codecpar);
 }
-} // namespace ffmpeg
-} // namespace torchaudio
+AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p)
+    : Wrapper<AVCodecParameters, AVCodecParametersDeleter>(p) {}
+} // namespace torchaudio::io
--- a/torchaudio/csrc/ffmpeg/ffmpeg.h
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.h
 // One stop header for all ffmpeg needs
 #pragma once
-#include <torch/torch.h>
+#include <torch/types.h>
 #include <cstdint>
 #include <map>
 #include <memory>
@@ -22,10 +22,12 @@ extern "C" {
 #include <libavutil/pixdesc.h>
 }
+/// @cond
 namespace torchaudio {
-namespace ffmpeg {
+namespace io {
-using OptionDict = c10::Dict<std::string, std::string>;
+using OptionDict = std::map<std::string, std::string>;
 // https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
 // Starting from libavformat 59 (ffmpeg 5),
@@ -52,7 +54,6 @@ av_always_inline std::string av_err2string(int errnum) {
 // The resource allocation will be provided by custom constructors.
 template <typename T, typename Deleter>
 class Wrapper {
- protected:
  std::unique_ptr<T, Deleter> ptr;
 public:
@@ -121,9 +122,11 @@ struct AVPacketDeleter {
 };
 struct AVPacketPtr : public Wrapper<AVPacket, AVPacketDeleter> {
-  AVPacketPtr();
+  explicit AVPacketPtr(AVPacket* p);
 };
+AVPacketPtr alloc_avpacket();
 ////////////////////////////////////////////////////////////////////////////////
 // AVPacket - buffer unref
 ////////////////////////////////////////////////////////////////////////////////
@@ -150,9 +153,11 @@ struct AVFrameDeleter {
 };
 struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
-  AVFramePtr();
+  explicit AVFramePtr(AVFrame* p);
 };
+AVFramePtr alloc_avframe();
 ////////////////////////////////////////////////////////////////////////////////
 // AutoBufferUnref is responsible for performing unref at the end of lifetime
 // of AVBufferRefPtr.
@@ -162,8 +167,7 @@ struct AutoBufferUnref {
 };
 struct AVBufferRefPtr : public Wrapper<AVBufferRef, AutoBufferUnref> {
-  AVBufferRefPtr();
+  explicit AVBufferRefPtr(AVBufferRef* p);
-  void reset(AVBufferRef* p);
 };
 ////////////////////////////////////////////////////////////////////////////////
@@ -184,8 +188,27 @@ struct AVFilterGraphDeleter {
  void operator()(AVFilterGraph* p);
 };
 struct AVFilterGraphPtr : public Wrapper<AVFilterGraph, AVFilterGraphDeleter> {
-  AVFilterGraphPtr();
+  explicit AVFilterGraphPtr(AVFilterGraph* p);
-  void reset();
+};
+////////////////////////////////////////////////////////////////////////////////
+// AVCodecParameters
+////////////////////////////////////////////////////////////////////////////////
+struct AVCodecParametersDeleter {
+  void operator()(AVCodecParameters* p);
+};
+struct AVCodecParametersPtr
+    : public Wrapper<AVCodecParameters, AVCodecParametersDeleter> {
+  explicit AVCodecParametersPtr(AVCodecParameters* p);
 };
-} // namespace ffmpeg
+struct StreamParams { // Bundles one stream's codec parameters, time base and stream index.
+  AVCodecParametersPtr codec_params{nullptr}; // owning wrapper; holds no parameters by default
+  AVRational time_base{}; // value-initialized by default
+  int stream_index{}; // value-initialized (0) by default
+};
+} // namespace io
 } // namespace torchaudio
+/// @endcond
--- a/torchaudio/csrc/ffmpeg/filter_graph.cpp
+++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp
 #include <torchaudio/csrc/ffmpeg/filter_graph.h>
 #include <stdexcept>
-namespace torchaudio {
+namespace torchaudio::io {
-namespace ffmpeg {
-FilterGraph::FilterGraph(AVMediaType media_type) : media_type(media_type) {
+namespace {
-  switch (media_type) {
+// Allocates an AVFilterGraph with avfilter_graph_alloc(), limits it to one
+// filter thread, and returns the raw pointer. Fails via TORCH_CHECK when the
+// allocation returns null.
+AVFilterGraph* get_filter_graph() {
-    case AVMEDIA_TYPE_AUDIO:
+  AVFilterGraph* ptr = avfilter_graph_alloc();
-    case AVMEDIA_TYPE_VIDEO:
+  TORCH_CHECK(ptr, "Failed to allocate resource.");
-      break;
+  ptr->nb_threads = 1; // cap the number of threads the graph's filters may use
-    default:
+  return ptr;
-      TORCH_CHECK(false, "Only audio and video type is supported.");
-  }
 }
+} // namespace
+FilterGraph::FilterGraph() : graph(get_filter_graph()) {}
 ////////////////////////////////////////////////////////////////////////////////
 // Configuration methods
@@ -39,6 +39,7 @@ std::string get_audio_src_args(
 std::string get_video_src_args(
    AVPixelFormat format,
    AVRational time_base,
+    AVRational frame_rate,
    int width,
    int height,
    AVRational sample_aspect_ratio) {
@@ -46,12 +47,14 @@ std::string get_video_src_args(
  std::snprintf(
      args,
      sizeof(args),
-      "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:pixel_aspect=%d/%d",
+      "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d",
      width,
      height,
      av_get_pix_fmt_name(format),
      time_base.num,
      time_base.den,
+      frame_rate.num,
+      frame_rate.den,
      sample_aspect_ratio.num,
      sample_aspect_ratio.den);
  return std::string(args);
@@ -64,41 +67,43 @@ void FilterGraph::add_audio_src(
    AVRational time_base,
    int sample_rate,
    uint64_t channel_layout) {
-  TORCH_CHECK(
+  add_src(
-      media_type == AVMEDIA_TYPE_AUDIO, "The filter graph is not audio type.");
+      avfilter_get_by_name("abuffer"),
-  std::string args =
+      get_audio_src_args(format, time_base, sample_rate, channel_layout));
-      get_audio_src_args(format, time_base, sample_rate, channel_layout);
-  add_src(args);
 }
 void FilterGraph::add_video_src(
    AVPixelFormat format,
    AVRational time_base,
+    AVRational frame_rate,
    int width,
    int height,
    AVRational sample_aspect_ratio) {
-  TORCH_CHECK(
+  add_src(
-      media_type == AVMEDIA_TYPE_VIDEO, "The filter graph is not video type.");
+      avfilter_get_by_name("buffer"),
-  std::string args =
+      get_video_src_args(
-      get_video_src_args(format, time_base, width, height, sample_aspect_ratio);
+          format, time_base, frame_rate, width, height, sample_aspect_ratio));
-  add_src(args);
 }
-void FilterGraph::add_src(const std::string& args) {
+void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
-  const AVFilter* buffersrc = avfilter_get_by_name(
-      media_type == AVMEDIA_TYPE_AUDIO ? "abuffer" : "buffer");
  int ret = avfilter_graph_create_filter(
-      &buffersrc_ctx, buffersrc, "in", args.c_str(), NULL, pFilterGraph);
+      &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph);
  TORCH_CHECK(
      ret >= 0,
      "Failed to create input filter: \"" + args + "\" (" + av_err2string(ret) +
          ")");
 }
-void FilterGraph::add_sink() {
+// Looks up FFmpeg's "abuffersink" filter and delegates to add_sink() to
+// create the sink of this graph.
+void FilterGraph::add_audio_sink() {
+  const AVFilter* audio_sink = avfilter_get_by_name("abuffersink");
+  add_sink(audio_sink);
+}
+// Looks up FFmpeg's "buffersink" filter and delegates to add_sink() to
+// create the sink of this graph.
+void FilterGraph::add_video_sink() {
+  const AVFilter* video_sink = avfilter_get_by_name("buffersink");
+  add_sink(video_sink);
+}
+void FilterGraph::add_sink(const AVFilter* buffersink) {
  TORCH_CHECK(!buffersink_ctx, "Sink buffer is already allocated.");
-  const AVFilter* buffersink = avfilter_get_by_name(
-      media_type == AVMEDIA_TYPE_AUDIO ? "abuffersink" : "buffersink");
  // Note
  // Originally, the code here followed the example
  // https://ffmpeg.org/doxygen/4.1/filtering_audio_8c-example.html
@@ -109,7 +114,7 @@ void FilterGraph::add_sink() {
  // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html
  // `abuffersink` should not take options, and this resolved issue.
  int ret = avfilter_graph_create_filter(
-      &buffersink_ctx, buffersink, "out", nullptr, nullptr, pFilterGraph);
+      &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph);
  TORCH_CHECK(ret >= 0, "Failed to create output filter.");
 }
@@ -151,7 +156,7 @@ void FilterGraph::add_process(const std::string& filter_description) {
  InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx};
  int ret = avfilter_graph_parse_ptr(
-      pFilterGraph, filter_description.c_str(), out, in, nullptr);
+      graph, filter_description.c_str(), out, in, nullptr);
  TORCH_CHECK(
      ret >= 0,
@@ -159,14 +164,69 @@ void FilterGraph::add_process(const std::string& filter_description) {
          av_err2string(ret) + ".)");
 }
-void FilterGraph::create_filter() {
+void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) { // Attaches hw_frames_ctx to the source filter's first output link, then configures the graph.
-  int ret = avfilter_graph_config(pFilterGraph, nullptr);
+  buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx; // pointer is stored as-is (no av_buffer_ref here); NOTE(review): confirm the caller hands over its own reference
+  int ret = avfilter_graph_config(graph, nullptr);
  TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
-  // char* desc = avfilter_graph_dump(pFilterGraph.get(), NULL);
+  // char* desc = avfilter_graph_dump(graph, NULL);
  // std::cerr << "Filter created:\n" << desc << std::endl;
  // av_free(static_cast<void*>(desc));
 }
+//////////////////////////////////////////////////////////////////////////////
+// Query methods
+//////////////////////////////////////////////////////////////////////////////
+// Describes the stream produced by the graph, as read from the input link of
+// the sink filter.
+//
+// The media type, format and time base are always filled in. Audio links
+// additionally report the sample rate and channel count; video links report
+// the frame rate, height and width. For any other media type the remaining
+// fields keep the values given by `ret{}`.
+//
+// Fails via TORCH_INTERNAL_ASSERT when no sink has been created, and via
+// TORCH_CHECK when the output is CUDA but no HW frames context can be found.
+FilterGraphOutputInfo FilterGraph::get_output_info() const {
+  TORCH_INTERNAL_ASSERT(buffersink_ctx, "FilterGraph is not initialized.");
+  AVFilterLink* l = buffersink_ctx->inputs[0];
+  FilterGraphOutputInfo ret{};
+  ret.type = l->type;
+  ret.format = l->format;
+  ret.time_base = l->time_base;
+  switch (l->type) {
+    case AVMEDIA_TYPE_AUDIO: {
+      ret.sample_rate = l->sample_rate;
+      // Compare (major, minor) as a pair. Testing the two components
+      // independently would reject newer majors whose minor is below 44 and
+      // send them down the deprecated channel_layout path.
+#if LIBAVFILTER_VERSION_MAJOR > 8 || \
+    (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
+      ret.num_channels = l->ch_layout.nb_channels;
+#else
+      // Before FFmpeg 5.1
+      ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
+#endif
+      break;
+    }
+    case AVMEDIA_TYPE_VIDEO: {
+      // If this is CUDA, retrieve the software pixel format from HW frames
+      // context.
+      if (l->format == AV_PIX_FMT_CUDA) {
+        // Originally, we were expecting that filter graph would propagate the
+        // HW frames context, so that we can retrieve it from the sink link.
+        // However, this is sometimes not the case.
+        // We do not know what is causing this behavior (GPU? libavfilter?
+        // format?), so we resort to the source link in such case.
+        //
+        // (Technically, filters like scale_cuda could change the pixel format.
+        // We expect that hw_frames_ctx is propagated in such cases, but we do
+        // not know.)
+        // TODO: check how scale_cuda interferes.
+        AVBufferRef* frames_ref = l->hw_frames_ctx;
+        if (!frames_ref) {
+          frames_ref = buffersrc_ctx->outputs[0]->hw_frames_ctx;
+        }
+        // Neither link carries a HW frames context: report it instead of
+        // dereferencing a null pointer.
+        TORCH_CHECK(
+            frames_ref,
+            "The filter graph outputs CUDA frames, but no hardware frames context is attached.");
+        ret.format =
+            reinterpret_cast<AVHWFramesContext*>(frames_ref->data)->sw_format;
+      }
+      ret.frame_rate = l->frame_rate;
+      ret.height = l->h;
+      ret.width = l->w;
+      break;
+    }
+    default:;
+  }
+  return ret;
+}
 ////////////////////////////////////////////////////////////////////////////////
 // Streaming process
 //////////////////////////////////////////////////////////////////////////////
@@ -179,5 +239,4 @@ int FilterGraph::get_frame(AVFrame* pOutputFrame) {
  return av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
 }
-} // namespace ffmpeg
+} // namespace torchaudio::io
-} // namespace torchaudio