merge v0.5.0

f48954a4 · zhuwenwen · 1dba29d3 · 8f89d720 · f48954a4 · f48954a4
Commit f48954a4 authored Jun 12, 2024 by zhuwenwen
20 changed files
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
-list(APPEND CXX_COMPILE_FLAGS 
+list(APPEND CXX_COMPILE_FLAGS
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")

@@ -44,8 +44,8 @@ if (AVX512_FOUND)

    find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
    if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND 
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) 
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
            list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
        else()
            message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
@@ -73,7 +73,7 @@ set(VLLM_EXT_SRC
    "csrc/cpu/cache.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/pybind.cpp")
+    "csrc/cpu/torch_bindings.cpp")

 define_gpu_extension_target(
    _C
@@ -81,10 +81,10 @@ define_gpu_extension_target(
    LANGUAGE CXX
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-    WITH_SOABI 
+    USE_SABI 3
+    WITH_SOABI
 )

 add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)
-
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -5,7 +5,7 @@
 macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
-  find_package(Python COMPONENTS Interpreter Development.Module)
+  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()
@@ -295,6 +295,7 @@ endmacro()
 # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
 # LIBRARIES <libraries>      - Extra link libraries.
 # WITH_SOABI                 - Generate library with python SOABI suffix name.
+# USE_SABI <version>         - Build against the Python stable ABI (limited API) of <version>.
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
@@ -302,7 +303,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
    GPU
    "WITH_SOABI"
-    "DESTINATION;LANGUAGE"
+    "DESTINATION;LANGUAGE;USE_SABI"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
@@ -316,7 +317,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
    set(GPU_WITH_SOABI)
  endif()

-  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
+  if (GPU_USE_SABI)
+    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  else()
+    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  endif()

  if (GPU_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.

--- a/collect_env.py
+++ b/collect_env.py
@@ -64,6 +64,7 @@ DEFAULT_CONDA_PATTERNS = {
    "triton",
    "optree",
    "nccl",
+    "transformers",
 }

 DEFAULT_PIP_PATTERNS = {
@@ -75,6 +76,7 @@ DEFAULT_PIP_PATTERNS = {
    "optree",
    "onnx",
    "nccl",
+    "transformers",
 }


@@ -601,6 +603,11 @@ Versions of relevant libraries:
 {conda_packages}
 """.strip()

+# Both the code above and the code below use `strip()` to remove
+# leading/trailing whitespace, so we need to add a newline in
+# between to keep the two sections separated.
+env_info_fmt += "\n"
+
 env_info_fmt += """
 ROCM Version: {rocm_version}
 Neuron SDK Version: {neuron_sdk_version}

--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>

 #include <cmath>

--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -17,7 +17,7 @@
 * limitations under the License.
 */

-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <algorithm>
@@ -808,16 +808,17 @@ void paged_attention_v1(
    torch::Tensor&
        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor&
-        value_cache,   // [num_blocks, num_heads, head_size, block_size]
-    int num_kv_heads,  // [num_heads]
-    float scale,
+        value_cache,       // [num_blocks, num_heads, head_size, block_size]
+    int64_t num_kv_heads,  // [num_heads]
+    double scale,
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
-    int block_size, int max_seq_len,
+    int64_t block_size, int64_t max_seq_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
  const bool is_block_sparse = (blocksparse_vert_stride > 1);

  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
@@ -972,16 +973,17 @@ void paged_attention_v2(
    torch::Tensor&
        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor&
-        value_cache,   // [num_blocks, num_heads, head_size, block_size]
-    int num_kv_heads,  // [num_heads]
-    float scale,
+        value_cache,       // [num_blocks, num_heads, head_size, block_size]
+    int64_t num_kv_heads,  // [num_heads]
+    double scale,
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
-    int block_size, int max_seq_len,
+    int64_t block_size, int64_t max_seq_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
                             CALL_V2_LAUNCHER_BLOCK_SIZE)
@@ -990,4 +992,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
\ No newline at end of file
+#undef DIVIDE_ROUND_UP
--- a/csrc/cache.h
+++ b/csrc/cache.h
 #pragma once

-#include <torch/extension.h>
+#include <torch/all.h>

 #include <map>
 #include <vector>
@@ -8,14 +8,18 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

-void copy_blocks(std::vector<torch::Tensor>& key_caches,
-                 std::vector<torch::Tensor>& value_caches,
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping);

 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, const float kv_scale);
+                       const std::string& kv_cache_dtype,
+                       const double kv_scale);

 void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                             torch::Tensor& key_cache,
@@ -25,4 +29,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,

 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const float scale, const std::string& kv_cache_dtype);
+                 const double scale, const std::string& kv_cache_dtype);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@@ -95,8 +95,11 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,

 }  // namespace vllm

-void copy_blocks(std::vector<torch::Tensor>& key_caches,
-                 std::vector<torch::Tensor>& value_caches,
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping) {
  int num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@@ -255,7 +258,7 @@ void reshape_and_cache(
    torch::Tensor&
        value_cache,  // [num_blocks, num_heads, head_size, block_size]
    torch::Tensor& slot_mapping,  // [num_tokens]
-    const std::string& kv_cache_dtype, const float kv_scale) {
+    const std::string& kv_cache_dtype, const double kv_scale) {
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
@@ -334,7 +337,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,

 // Only for testing.
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const float kv_scale, const std::string& kv_cache_dtype) {
+                 const double kv_scale, const std::string& kv_cache_dtype) {
  torch::Device src_device = src_cache.device();
  torch::Device dst_device = dst_cache.device();
  TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")

--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -420,12 +420,13 @@ void paged_attention_v1_impl_launcher(

 void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
  TORCH_CHECK(kv_scale == 1.0f);
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
@@ -738,12 +739,13 @@ void paged_attention_v2_impl_launcher(
 void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
  TORCH_CHECK(kv_scale == 1.0f);
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");

--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -5,8 +5,8 @@

 namespace {
 template <typename scalar_t>
-void copy_blocks_cpu_impl(std::vector<torch::Tensor>& key_caches,
-                          std::vector<torch::Tensor>& value_caches,
+void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
+                          std::vector<torch::Tensor> const& value_caches,
                          const torch::Tensor& mapping_pairs,
                          const int element_num_per_block,
                          const int layer_num) {
@@ -82,8 +82,11 @@ void reshape_and_cache_cpu_impl(
 }
 };  // namespace

-void copy_blocks(std::vector<torch::Tensor>& key_caches,
-                 std::vector<torch::Tensor>& value_caches,
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping) {
  unsigned num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@@ -104,7 +107,7 @@ void copy_blocks(std::vector<torch::Tensor>& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, float kv_scale) {
+                       const std::string& kv_cache_dtype, double kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);

  int num_tokens = key.size(0);

--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -3,7 +3,7 @@
 #define CPU_TYPES_HPP

 #include <immintrin.h>
-#include <torch/extension.h>
+#include <torch/all.h>

 namespace vec_op {


--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
@@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
 }  // namespace

 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              float epsilon) {
+              double epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

@@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 }

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, float epsilon) {
+                        torch::Tensor& weight, double epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;


--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -21,73 +21,74 @@ void rotary_embedding_impl(
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

  const int embed_dim = rot_dim / 2;
-  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
+  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
+  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;

-#pragma omp parallel for
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    int64_t pos = positions[token_idx];
-    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
+  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
+                          scalar_t* qk) {
+    int j = 0;
+    for (; j < loop_upper; j += VEC_ELEM_NUM) {
+      const int rot_offset = j;
+      const int x_index = rot_offset;
+      const int y_index = embed_dim + rot_offset;

-    for (int i = 0; i < num_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head =
-          token_idx * query_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
+      const int64_t out_x = token_head + x_index;
+      const int64_t out_y = token_head + y_index;

-        const int64_t out_x = token_head + x_index;
-        const int64_t out_y = token_head + y_index;
+      const scalar_vec_t cos(cache_ptr + x_index);
+      const scalar_vec_t sin(cache_ptr + y_index);

-        const scalar_vec_t cos(cache_ptr + x_index);
-        const scalar_vec_t sin(cache_ptr + y_index);
+      const scalar_vec_t q_x(qk + out_x);
+      const scalar_vec_t q_y(qk + out_y);

-        const scalar_vec_t q_x(query + out_x);
-        const scalar_vec_t q_y(query + out_y);
+      vec_op::FP32Vec8 fp32_cos(cos);
+      vec_op::FP32Vec8 fp32_sin(sin);

-        vec_op::FP32Vec8 fp32_cos(cos);
-        vec_op::FP32Vec8 fp32_sin(sin);
+      vec_op::FP32Vec8 fp32_q_x(q_x);
+      vec_op::FP32Vec8 fp32_q_y(q_y);

-        vec_op::FP32Vec8 fp32_q_x(q_x);
-        vec_op::FP32Vec8 fp32_q_y(q_y);
+      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+      scalar_vec_t(out1).save(qk + out_x);

-        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-        scalar_vec_t(out1).save(query + out_x);
-
-        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
-        scalar_vec_t(out2).save(query + out_y);
-      }
+      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      scalar_vec_t(out2).save(qk + out_y);
    }
-
-    for (int i = 0; i < num_kv_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
+    if (!flag) {
+      for (; j < embed_dim; ++j) {
+        const int x_index = j;
+        const int y_index = embed_dim + j;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

-        const scalar_vec_t cos(cache_ptr + x_index);
-        const scalar_vec_t sin(cache_ptr + y_index);
+        const float fp32_cos = cache_ptr[x_index];
+        const float fp32_sin = cache_ptr[y_index];

-        const scalar_vec_t k_x(key + out_x);
-        const scalar_vec_t k_y(key + out_y);
+        const float fp32_q_x = qk[out_x];
+        const float fp32_q_y = qk[out_y];

-        vec_op::FP32Vec8 fp32_cos(cos);
-        vec_op::FP32Vec8 fp32_sin(sin);
+        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      }
+    }
+  };

-        vec_op::FP32Vec8 fp32_k_x(k_x);
-        vec_op::FP32Vec8 fp32_k_y(k_y);
+#pragma omp parallel for
+  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    int64_t pos = positions[token_idx];
+    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;

-        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
-        scalar_vec_t(out1).save(key + out_x);
-        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
-        scalar_vec_t(out2).save(key + out_y);
-      }
+    for (int i = 0; i < num_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head =
+          token_idx * query_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, query);
+    }
+
+    for (int i = 0; i < num_kv_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, key);
    }
  }
 }
@@ -167,7 +168,7 @@ void rotary_embedding_gptj_impl(
 };  // namespace

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int head_size,
+                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);

--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
-#include "cache.h"
-#include "cuda_utils.h"
-#include "ops.h"
-#include <torch/extension.h>
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  // vLLM custom ops
-  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
-
-  // Attention ops
-  ops.def("paged_attention_v1", &paged_attention_v1,
-          "Compute the attention between an input query and the cached "
-          "keys/values using PagedAttention.");
-  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
-
-  // Activation ops
-  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
-  ops.def("gelu_and_mul", &gelu_and_mul,
-          "Activation function used in GeGLU with `none` approximation.");
-  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
-          "Activation function used in GeGLU with `tanh` approximation.");
-  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
-  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
-
-  // Layernorm
-  ops.def("rms_norm", &rms_norm,
-          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
-
-  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
-          "In-place fused Add and RMS Normalization");
-
-  // Rotary embedding
-  ops.def("rotary_embedding", &rotary_embedding,
-          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
-
-  // Cache ops
-  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
-  cache_ops.def("swap_blocks", &swap_blocks,
-                "Swap in (out) the cache blocks from src to dst");
-  cache_ops.def("copy_blocks", &copy_blocks,
-                "Copy the cache blocks from src to dst");
-  cache_ops.def("reshape_and_cache", &reshape_and_cache,
-                "Reshape the key and value tensors and cache them");
-}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
+#include "cache.h"
+#include "ops.h"
+#include "registration.h"
+
+#include <torch/library.h>
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  // vLLM custom ops
+
+  // Attention ops
+  // Compute the attention between an input query and the cached keys/values
+  // using PagedAttention.
+  ops.def(
+      "paged_attention_v1("
+      "    Tensor! out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
+      "    int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
+  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
+
+  // PagedAttention V2.
+  ops.def(
+      "paged_attention_v2("
+      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
+      "    int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
+  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
+
+  // Activation ops
+
+  // Activation function used in SwiGLU.
+  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);
+
+  // Activation function used in GeGLU with `none` approximation.
+  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);
+
+  // Activation function used in GeGLU with `tanh` approximation.
+  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);
+
+  // GELU implementation used in GPT-2.
+  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_new", torch::kCPU, &gelu_new);
+
+  // Approximate GELU implementation.
+  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
+
+  // Layernorm
+  // Apply Root Mean Square (RMS) Normalization to the input tensor.
+  ops.def(
+      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
+      "()");
+  ops.impl("rms_norm", torch::kCPU, &rms_norm);
+
+  // In-place fused Add and RMS Normalization.
+  ops.def(
+      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
+      "float epsilon) -> ()");
+  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);
+
+  // Rotary embedding
+  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
+  ops.def(
+      "rotary_embedding(Tensor positions, Tensor! query,"
+      "                 Tensor! key, int head_size,"
+      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
+  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
+}
+
+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
+  // Cache ops
+  // Swap in (out) the cache blocks from src to dst.
+  cache_ops.def(
+      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
+  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
+
+  // Copy the cache blocks from src to dst.
+  cache_ops.def(
+      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "block_mapping) -> ()");
+  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
+
+  // Reshape the key and value tensors and cache them.
+  cache_ops.def(
+      "reshape_and_cache(Tensor key, Tensor value,"
+      "                  Tensor! key_cache, Tensor! value_cache,"
+      "                  Tensor slot_mapping,"
+      "                  str kv_cache_dtype,"
+      "                  float kv_scale) -> ()");
+  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@@ -19,8 +19,12 @@
 #ifndef USE_ROCM
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
    __shfl_xor_sync(uint32_t(-1), var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
+    __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
 #else
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
+    __shfl_xor(var, lane_mask, width)
 #endif

 #ifndef USE_ROCM

--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
 #pragma once

-#include <torch/extension.h>
+int64_t get_device_attribute(int64_t attribute, int64_t device_id);

-int get_device_attribute(int attribute, int device_id);
-
-int get_max_shared_memory_per_block_device_attribute(int device_id);
+int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
--- a/csrc/cuda_utils_kernels.cu
+++ b/csrc/cuda_utils_kernels.cu
@@ -2,7 +2,7 @@
  #include <hip/hip_runtime.h>
  #include <hip/hip_runtime_api.h>
 #endif
-int get_device_attribute(int attribute, int device_id) {
+int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
  int device, value;
  if (device_id < 0) {
    cudaGetDevice(&device);
@@ -14,8 +14,8 @@ int get_device_attribute(int attribute, int device_id) {
  return value;
 }

-int get_max_shared_memory_per_block_device_attribute(int device_id) {
-  int attribute;
+int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
+  int64_t attribute;
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
  // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74


--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
 #include <ATen/cuda/Exceptions.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <torch/extension.h>
+#include <torch/all.h>

 #include "custom_all_reduce.cuh"

-// fake pointer type
-using fptr_t = uint64_t;
+// fake pointer type, must match fptr_t type in ops.h
+using fptr_t = int64_t;
 static_assert(sizeof(void*) == sizeof(fptr_t));

 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int rank,
+                      const std::vector<int64_t>& offsets, int64_t rank,
                      bool full_nvlink) {
  int world_size = offsets.size();
  if (world_size > 8)
@@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
          t.numel() * t.element_size());
 }

-bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
+bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
                      bool full_nvlink) {
  auto inp_size = inp.numel() * inp.element_size();
  // custom allreduce requires input byte size to be multiples of 16
@@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
  delete fa;
 }

-int meta_size() { return sizeof(vllm::Signal); }
+int64_t meta_size() { return sizeof(vllm::Signal); }

 void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
@@ -134,10 +134,16 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
  fa->register_buffer(handles, offsets, t.data_ptr());
 }

-std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
-  return fa->get_graph_buffer_ipc_meta();
+  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto handles =
+      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
+  std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
+  return {handles, std::move(offsets)};
 }

 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,

--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -4,7 +4,7 @@
 */
 #pragma once

-#include <torch/extension.h>
+#include <torch/all.h>

 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \

--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
              torch::Tensor& input,   // [..., hidden_size]
              torch::Tensor& weight,  // [hidden_size]
-              float epsilon) {
+              double epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

@@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
 void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                        torch::Tensor& residual,  // [..., hidden_size]
                        torch::Tensor& weight,    // [hidden_size]
-                        float epsilon) {
+                        double epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;