[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#4722)

5f6d10c1 · Michael Goin · GitHub · 9b9a10d6 · 5f6d10c1 · 5f6d10c1
Unverified Commit 5f6d10c1 authored May 22, 2024 by Michael Goin Committed by GitHub May 22, 2024
20 changed files
--- a/.clang-format
+++ b/.clang-format
+BasedOnStyle: Google
+UseTab: Never
+IndentWidth: 2
+ColumnLimit: 80
+
+# Force pointers to the type for C++.
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Reordering #include statements can (and currently will) introduce errors
+SortIncludes: false
+
+# Style choices
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+IndentPPDirectives: BeforeHash
+
+IncludeCategories:
+  - Regex:           '^<'
+    Priority:        4
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
+    Priority:        3
+  - Regex:           '^"(qoda|\.\.)/'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        1
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
+name: clang-format
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  clang-format:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install clang-format==18.1.5
+    - name: Running clang-format
+      run: |
+        EXCLUDES=(
+            'csrc/moe/topk_softmax_kernels.cu'
+            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+            'csrc/punica/bgmv/bgmv_config.h'
+            'csrc/punica/bgmv/bgmv_impl.cuh'
+            'csrc/punica/bgmv/vec_dtypes.cuh'
+            'csrc/punica/punica_ops.cu'
+            'csrc/punica/type_convert.h'
+        )
+        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
+            | xargs clang-format --dry-run --Werror
\ No newline at end of file
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -10,11 +10,11 @@
 namespace vllm {

 // Activation and gating kernel template.
-template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
 __global__ void act_and_mul_kernel(
-  scalar_t* __restrict__ out,               // [..., d]
-  const scalar_t* __restrict__ input,       // [..., 2, d]
-  const int d) {
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
  const int64_t token_idx = blockIdx.x;
  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
@@ -23,72 +23,66 @@ __global__ void act_and_mul_kernel(
  }
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T silu_kernel(const T& x) {
  // x * sigmoid(x)
-  return (T) (((float) x) / (1.0f + expf((float) -x)));
+  return (T)(((float)x) / (1.0f + expf((float)-x)));
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'none' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
-  const float f = (float) x;
+  const float f = (float)x;
  constexpr float ALPHA = M_SQRT1_2;
-  return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+  return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'tanh' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
-  const float f = (float) x;
+  const float f = (float)x;
  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
  constexpr float KAPPA = 0.044715;
  float x_cube = f * f * f;
  float inner = BETA * (f + KAPPA * x_cube);
-  return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
+  return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
 }

-} // namespace vllm
+}  // namespace vllm

 // Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                                             \
-  int d = input.size(-1) / 2;                                                             \
-  int64_t num_tokens = input.numel() / input.size(-1);                                    \
-  dim3 grid(num_tokens);                                                                  \
-  dim3 block(std::min(d, 1024));                                                          \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));                       \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                           \
-  VLLM_DISPATCH_FLOATING_TYPES(                                                           \
-    input.scalar_type(),                                                                  \
-    "act_and_mul_kernel",                                                                 \
-    [&] {                                                                                 \
-      vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>(   \
-        out.data_ptr<scalar_t>(),                                                         \
-        input.data_ptr<scalar_t>(),                                                       \
-        d);                                                                               \
-    });
-
-void silu_and_mul(
-  torch::Tensor& out,      // [..., d]
-  torch::Tensor& input)    // [..., 2 * d]
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                            \
+  int d = input.size(-1) / 2;                                            \
+  int64_t num_tokens = input.numel() / input.size(-1);                   \
+  dim3 grid(num_tokens);                                                 \
+  dim3 block(std::min(d, 1024));                                         \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));      \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
+  VLLM_DISPATCH_FLOATING_TYPES(                                          \
+      input.scalar_type(), "act_and_mul_kernel", [&] {                   \
+        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>             \
+            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
+                                         input.data_ptr<scalar_t>(), d); \
+      });
+
+void silu_and_mul(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
 {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }

-void gelu_and_mul(
-  torch::Tensor& out,      // [..., d]
-  torch::Tensor& input)    // [..., 2 * d]
+void gelu_and_mul(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
 {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
 }

-void gelu_tanh_and_mul(
-  torch::Tensor& out,      // [..., d]
-  torch::Tensor& input)    // [..., 2 * d]
+void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
+                       torch::Tensor& input)  // [..., 2 * d]
 {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
 }
@@ -96,11 +90,11 @@ void gelu_tanh_and_mul(
 namespace vllm {

 // Element-wise activation kernel template.
-template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
 __global__ void activation_kernel(
-  scalar_t* __restrict__ out,               // [..., d]
-  const scalar_t* __restrict__ input,       // [..., d]
-  const int d) {
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., d]
+    const int d) {
  const int64_t token_idx = blockIdx.x;
  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
@@ -108,54 +102,49 @@ __global__ void activation_kernel(
  }
 }

-} // namespace vllm
+}  // namespace vllm

 // Launch element-wise activation kernel.
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL)                                                  \
-  int d = input.size(-1);                                                                 \
-  int64_t num_tokens = input.numel() / d;                                                 \
-  dim3 grid(num_tokens);                                                                  \
-  dim3 block(std::min(d, 1024));                                                          \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));                       \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                           \
-  VLLM_DISPATCH_FLOATING_TYPES(                                                           \
-    input.scalar_type(),                                                                  \
-    "activation_kernel",                                                                  \
-    [&] {                                                                                 \
-      vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>(    \
-        out.data_ptr<scalar_t>(),                                                         \
-        input.data_ptr<scalar_t>(),                                                       \
-        d);                                                                               \
-    });
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL)                                       \
+  int d = input.size(-1);                                                      \
+  int64_t num_tokens = input.numel() / d;                                      \
+  dim3 grid(num_tokens);                                                       \
+  dim3 block(std::min(d, 1024));                                               \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
+    vllm::activation_kernel<scalar_t, KERNEL<scalar_t>>                        \
+        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),                 \
+                                     input.data_ptr<scalar_t>(), d);           \
+  });

 namespace vllm {

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_new_kernel(const T& x) {
-  const float x3 = (float) (x * x * x);
-  const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
-  return ((T) 0.5) * x * (((T) 1.0) + t);
+  const float x3 = (float)(x * x * x);
+  const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
+  return ((T)0.5) * x * (((T)1.0) + t);
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
-  const float f = (float) x;
-  const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
-  return ((T) 0.5) * x * (((T) 1.0) + t);
+  const float f = (float)x;
+  const T t =
+      (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
+  return ((T)0.5) * x * (((T)1.0) + t);
 }

-} // namespace vllm
+}  // namespace vllm

-void gelu_new(
-  torch::Tensor& out,     // [..., d]
-  torch::Tensor& input)   // [..., d]
+void gelu_new(torch::Tensor& out,    // [..., d]
+              torch::Tensor& input)  // [..., d]
 {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
 }

-void gelu_fast(
-  torch::Tensor& out,     // [..., d]
-  torch::Tensor& input)   // [..., d]
+void gelu_fast(torch::Tensor& out,    // [..., d]
+               torch::Tensor& input)  // [..., d]
 {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
 }
--- a/csrc/attention/attention_generic.cuh
+++ b/csrc/attention/attention_generic.cuh
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 *
@@ -22,31 +23,31 @@
 namespace vllm {

 // A vector type to store Q, K, V elements.
-template<typename T, int VEC_SIZE>
+template <typename T, int VEC_SIZE>
 struct Vec {};

 // A vector type to store FP32 accumulators.
-template<typename T>
+template <typename T>
 struct FloatVec {};

 // Template vector operations.
-template<typename Acc, typename A, typename B>
+template <typename Acc, typename A, typename B>
 inline __device__ Acc mul(A a, B b);

-template<typename T>
+template <typename T>
 inline __device__ float sum(T v);

-template<typename T>
+template <typename T>
 inline __device__ float dot(T a, T b) {
  return sum(mul<T, T, T>(a, b));
 }

-template<typename A, typename T>
+template <typename A, typename T>
 inline __device__ float dot(T a, T b) {
  return sum(mul<A, T, T>(a, b));
 }

-template<typename T>
+template <typename T>
 inline __device__ void zero(T& dst) {
  constexpr int WORDS = sizeof(T) / 4;
  union {
@@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) {
  dst = tmp.raw;
 }

-} // namespace vllm
+}  // namespace vllm
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 *
@@ -26,7 +27,7 @@
 namespace vllm {

 // Q*K^T operation.
-template<int THREAD_GROUP_SIZE, typename Vec, int N>
+template <int THREAD_GROUP_SIZE, typename Vec, int N>
 inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  using A_vec = typename FloatVec<Vec>::Type;
  // Compute the parallel products for Q*K^T (treat vector lanes separately).
@@ -45,12 +46,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  return qk;
 }

-template<typename T, int THREAD_GROUP_SIZE>
+template <typename T, int THREAD_GROUP_SIZE>
 struct Qk_dot {
-  template<typename Vec, int N>
+  template <typename Vec, int N>
  static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
    return qk_dot_<THREAD_GROUP_SIZE>(q, k);
  }
 };

-} // namespace vllm
+}  // namespace vllm
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 *
@@ -28,8 +30,8 @@
  #include <hip/hip_bf16.h>
  #include <hip/hip_fp16.h>

-  typedef __hip_bfloat162 __nv_bfloat162;
-  typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
 #endif

 #include <stdint.h>
@@ -50,37 +52,37 @@ struct bf16_8_t {
 };

 // BF16 vector types for Q, K, V.
-template<>
+template <>
 struct Vec<__nv_bfloat16, 1> {
  using Type = __nv_bfloat16;
 };
-template<>
+template <>
 struct Vec<__nv_bfloat16, 2> {
  using Type = __nv_bfloat162;
 };
-template<>
+template <>
 struct Vec<__nv_bfloat16, 4> {
  using Type = bf16_4_t;
 };
-template<>
+template <>
 struct Vec<__nv_bfloat16, 8> {
  using Type = bf16_8_t;
 };

 // FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
 struct FloatVec<__nv_bfloat16> {
  using Type = float;
 };
-template<>
+template <>
 struct FloatVec<__nv_bfloat162> {
  using Type = float2;
 };
-template<>
+template <>
 struct FloatVec<bf16_4_t> {
  using Type = Float4_;
 };
-template<>
+template <>
 struct FloatVec<bf16_8_t> {
  using Type = Float8_;
 };
@@ -108,9 +110,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
  assert(false);
 #else
  #ifndef USE_ROCM
-    return a + b;
+  return a + b;
  #else
-    return __hadd(a, b);
+  return __hadd(a, b);
  #endif
 #endif
 }
@@ -161,7 +163,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
 }

 // Vector multiplication.
-template<>
+template <>
 inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
  assert(false);
@@ -170,7 +172,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
 #endif
 }

-template<>
+template <>
 inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
  assert(false);
@@ -179,12 +181,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
 #endif
 }

-template<>
+template <>
 inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
  return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
 }

-template<>
+template <>
 inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
  bf16_4_t c;
  c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -192,7 +194,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
  return c;
 }

-template<>
+template <>
 inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
  __nv_bfloat162 s = bf162bf162(a);
  bf16_4_t c;
@@ -201,7 +203,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
  return c;
 }

-template<>
+template <>
 inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
  bf16_8_t c;
  c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -211,7 +213,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
  return c;
 }

-template<>
+template <>
 inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
  __nv_bfloat162 s = bf162bf162(a);
  bf16_8_t c;
@@ -222,26 +224,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
  return c;
 }

-template<>
+template <>
 inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
  float fa = __bfloat162float(a);
  float fb = __bfloat162float(b);
  return fa * fb;
 }

-template<>
+template <>
 inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
  float2 fa = bf1622float2(a);
  float2 fb = bf1622float2(b);
  return mul<float2, float2, float2>(fa, fb);
 }

-template<>
+template <>
 inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
  return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
 }

-template<>
+template <>
 inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
  Float4_ fc;
  fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -249,7 +251,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
  return fc;
 }

-template<>
+template <>
 inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
  __nv_bfloat162 s = bf162bf162(a);
  Float4_ fc;
@@ -258,7 +260,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
  return fc;
 }

-template<>
+template <>
 inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
  Float8_ fc;
  fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -268,7 +270,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
  return fc;
 }

-template<>
+template <>
 inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
  __nv_bfloat162 s = bf162bf162(a);
  Float8_ fc;
@@ -280,7 +282,8 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
 }

 // Vector fused multiply-add.
-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
+                                     __nv_bfloat162 c) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
  assert(false);
 #else
@@ -288,7 +291,8 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bf
 #endif
 }

-inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
+                                     __nv_bfloat162 c) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
  assert(false);
 #else
@@ -379,23 +383,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
 }

 // Vector sum.
-template<>
+template <>
 inline __device__ float sum(__nv_bfloat16 v) {
  return __bfloat162float(v);
 }

-template<>
+template <>
 inline __device__ float sum(__nv_bfloat162 v) {
  float2 vf = bf1622float2(v);
  return vf.x + vf.y;
 }

-template<>
+template <>
 inline __device__ float sum(bf16_4_t v) {
  return sum(v.x) + sum(v.y);
 }

-template<>
+template <>
 inline __device__ float sum(bf16_8_t v) {
  return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
 }
@@ -448,4 +452,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
 #endif
 }

-} // namespace vllm
+}  // namespace vllm
--- a/csrc/attention/dtype_float16.cuh
+++ b/csrc/attention/dtype_float16.cuh
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 *
@@ -30,37 +32,37 @@
 namespace vllm {

 // FP16 vector types for Q, K, V.
-template<>
+template <>
 struct Vec<uint16_t, 1> {
  using Type = uint16_t;
 };
-template<>
+template <>
 struct Vec<uint16_t, 2> {
  using Type = uint32_t;
 };
-template<>
+template <>
 struct Vec<uint16_t, 4> {
  using Type = uint2;
 };
-template<>
+template <>
 struct Vec<uint16_t, 8> {
  using Type = uint4;
 };

 // FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
 struct FloatVec<uint16_t> {
  using Type = float;
 };
-template<>
+template <>
 struct FloatVec<uint32_t> {
  using Type = float2;
 };
-template<>
+template <>
 struct FloatVec<uint2> {
  using Type = Float4_;
 };
-template<>
+template <>
 struct FloatVec<uint4> {
  using Type = Float8_;
 };
@@ -73,8 +75,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) {
  return b;
 #else
  union {
-   uint32_t u32;
-   uint16_t u16[2];
+    uint32_t u32;
+    uint16_t u16[2];
  } tmp;
  tmp.u16[0] = a;
  tmp.u16[1] = a;
@@ -130,10 +132,12 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
  } tmp;
 #ifndef USE_ROCM
  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+  asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
+               : "=r"(tmp.u32)
+               : "f"(f.y), "f"(f.x));
  #else
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+  asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+  asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
  #endif
 #else
  tmp.u16[0] = float_to_half(f.x);
@@ -201,7 +205,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
 }

 // Vector multiplication.
-template<>
+template <>
 inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
  uint16_t c;
 #ifndef USE_ROCM
@@ -212,7 +216,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
  return c;
 }

-template<>
+template <>
 inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
  uint32_t c;
 #ifndef USE_ROCM
@@ -223,12 +227,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
  return c;
 }

-template<>
+template <>
 inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
  return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
 }

-template<>
+template <>
 inline __device__ uint2 mul(uint2 a, uint2 b) {
  uint2 c;
  c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
@@ -236,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ uint2 mul(uint16_t a, uint2 b) {
  uint32_t s = h0_h0(a);
  uint2 c;
@@ -245,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ uint4 mul(uint4 a, uint4 b) {
  uint4 c;
  c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
@@ -255,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ uint4 mul(uint16_t a, uint4 b) {
  uint32_t s = h0_h0(a);
  uint4 c;
@@ -266,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ float mul(uint16_t a, uint16_t b) {
  float fa = half_to_float(a);
  float fb = half_to_float(b);
  return fa * fb;
 }

-template<>
+template <>
 inline __device__ float2 mul(uint32_t a, uint32_t b) {
  float2 fa = half2_to_float2(a);
  float2 fb = half2_to_float2(b);
  return mul<float2, float2, float2>(fa, fb);
 }

-template<>
+template <>
 inline __device__ float2 mul(uint16_t a, uint32_t b) {
  return mul<float2, uint32_t, uint32_t>(h0_h0(a), b);
 }

-template<>
+template <>
 inline __device__ Float4_ mul(uint2 a, uint2 b) {
  Float4_ fc;
  fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
@@ -293,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
  return fc;
 }

-template<>
+template <>
 inline __device__ Float4_ mul(uint16_t a, uint2 b) {
  uint32_t s = h0_h0(a);
  Float4_ fc;
@@ -302,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
  return fc;
 }

-template<>
+template <>
 inline __device__ Float8_ mul(uint4 a, uint4 b) {
  Float8_ fc;
  fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
@@ -312,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
  return fc;
 }

-template<>
+template <>
 inline __device__ Float8_ mul(uint16_t a, uint4 b) {
  uint32_t s = h0_h0(a);
  Float8_ fc;
@@ -327,9 +331,13 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
 inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
  uint32_t d;
 #ifndef USE_ROCM
-  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+               : "=r"(d)
+               : "r"(a), "r"(b), "r"(c));
 #else
-  asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
+  asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
+               : "=v"(d)
+               : "v"(a), "v"(b), "v"(c));
 #endif
  return d;
 }
@@ -423,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
 }

 // Vector sum.
-template<>
+template <>
 inline __device__ float sum(uint16_t v) {
  return half_to_float(v);
 }

-template<>
+template <>
 inline __device__ float sum(uint32_t v) {
  float2 tmp = half2_to_float2(v);
  return tmp.x + tmp.y;
 }

-template<>
+template <>
 inline __device__ float sum(uint2 v) {
  uint32_t c = add(v.x, v.y);
  return sum(c);
 }

-template<>
+template <>
 inline __device__ float sum(uint4 v) {
  uint32_t c = add(v.x, v.y);
  c = add(c, v.z);
@@ -470,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
 }

 // From float16 to float32.
-inline __device__ float to_float(uint16_t u) {
-  return half_to_float(u);
-}
+inline __device__ float to_float(uint16_t u) { return half_to_float(u); }

-inline __device__ float2 to_float(uint32_t u) {
-  return half2_to_float2(u);
-}
+inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }

 inline __device__ Float4_ to_float(uint2 u) {
  Float4_ tmp;
@@ -495,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) {
 }

 // Zero-out a variable.
-inline __device__ void zero(uint16_t& dst) {
-  dst = uint16_t(0);
-}
+inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }

-} // namespace vllm
+}  // namespace vllm
--- a/csrc/attention/dtype_float32.cuh
+++ b/csrc/attention/dtype_float32.cuh
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 *
@@ -38,37 +40,35 @@ struct Float8_ {
 };

 // FP32 vector types for Q, K, V.
-template<>
+template <>
 struct Vec<float, 1> {
  using Type = float;
 };
-template<>
+template <>
 struct Vec<float, 2> {
  using Type = float2;
 };
-template<>
+template <>
 struct Vec<float, 4> {
  using Type = float4;
 };

 // FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
 struct FloatVec<float> {
  using Type = float;
 };
-template<>
+template <>
 struct FloatVec<float2> {
  using Type = float2;
 };
-template<>
+template <>
 struct FloatVec<float4> {
  using Type = float4;
 };

 // Vector addition.
-inline __device__ float add(float a, float b) {
-  return a + b;
-}
+inline __device__ float add(float a, float b) { return a + b; }

 inline __device__ float2 add(float2 a, float2 b) {
  float2 c;
@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
 }

 // Vector multiplication.
-template<>
+template <>
 inline __device__ float mul<float, float>(float a, float b) {
  return a * b;
 }

-template<>
+template <>
 inline __device__ float2 mul(float2 a, float2 b) {
  float2 c;
  c.x = a.x * b.x;
@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ float2 mul(float a, float2 b) {
  float2 c;
  c.x = a * b.x;
@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ float4 mul(float4 a, float4 b) {
  float4 c;
  c.x = a.x * b.x;
@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
  return c;
 }

-template<>
+template <>
 inline __device__ float4 mul(float a, float4 b) {
  float4 c;
  c.x = a * b.x;
@@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) {
 }

 // Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) {
-  return a * b + c;
-}
+inline __device__ float fma(float a, float b, float c) { return a * b + c; }

 inline __device__ float2 fma(float2 a, float2 b, float2 c) {
  float2 d;
@@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
 }

 // Vector sum.
-template<>
+template <>
 inline __device__ float sum(float v) {
  return v;
 }

-template<>
+template <>
 inline __device__ float sum(float2 v) {
  return v.x + v.y;
 }

-template<>
+template <>
 inline __device__ float sum(float4 v) {
  return v.x + v.y + v.z + v.w;
 }

-template<>
+template <>
 inline __device__ float sum(Float4_ v) {
  return v.x.x + v.x.y + v.y.x + v.y.y;
 }

-template<>
+template <>
 inline __device__ float sum(Float8_ v) {
  return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
 }

 // Vector dot product.
-inline __device__ float dot(float a, float b) {
-  return a * b;
-}
+inline __device__ float dot(float a, float b) { return a * b; }

 inline __device__ float dot(float2 a, float2 b) {
  float2 c = mul<float2, float2, float2>(a, b);
@@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
 }

 // From float to float.
-inline __device__ void from_float(float& dst, float src) {
-  dst = src;
-}
+inline __device__ void from_float(float& dst, float src) { dst = src; }

-inline __device__ void from_float(float2& dst, float2 src) {
-  dst = src;
-}
+inline __device__ void from_float(float2& dst, float2 src) { dst = src; }

-inline __device__ void from_float(float4& dst, float4 src) {
-  dst = src;
-}
+inline __device__ void from_float(float4& dst, float4 src) { dst = src; }

 // From float to float.
-inline __device__ float to_float(float u) {
-  return u;
-}
+inline __device__ float to_float(float u) { return u; }

-inline __device__ float2 to_float(float2 u) {
-  return u;
-}
+inline __device__ float2 to_float(float2 u) { return u; }

-inline __device__ float4 to_float(float4 u) {
-  return u;
-}
+inline __device__ float4 to_float(float4 u) { return u; }

-inline __device__ Float4_ to_float(Float4_ u) {
-  return u;
-}
+inline __device__ Float4_ to_float(Float4_ u) { return u; }

-inline __device__ Float8_ to_float(Float8_ u) {
-  return u;
-}
+inline __device__ Float8_ to_float(Float8_ u) { return u; }

 // Zero-out a variable.
-inline __device__ void zero(float& dst) {
-  dst = 0.f;
-}
+inline __device__ void zero(float& dst) { dst = 0.f; }

-} // namespace vllm
+}  // namespace vllm
--- a/csrc/attention/dtype_fp8.cuh
+++ b/csrc/attention/dtype_fp8.cuh
@@ -4,38 +4,38 @@

 #include <stdint.h>
 #ifdef ENABLE_FP8
-#ifndef USE_ROCM 
-#include <cuda_fp8.h>
-#endif // USE_ROCM
-#endif // ENABLE_FP8
+  #ifndef USE_ROCM
+    #include <cuda_fp8.h>
+  #endif  // USE_ROCM
+#endif    // ENABLE_FP8

 namespace vllm {

 enum class Fp8KVCacheDataType {
-    kAuto = 0,
-    kFp8E4M3 = 1,
-    kFp8E5M2 = 2,
+  kAuto = 0,
+  kFp8E4M3 = 1,
+  kFp8E5M2 = 2,
 };

 // fp8 vector types for quantization of kv cache
-template<>
+template <>
 struct Vec<uint8_t, 1> {
-    using Type = uint8_t;
+  using Type = uint8_t;
 };

-template<>
+template <>
 struct Vec<uint8_t, 2> {
-    using Type = uint16_t;
+  using Type = uint16_t;
 };

-template<>
+template <>
 struct Vec<uint8_t, 4> {
-    using Type = uint32_t;
+  using Type = uint32_t;
 };

-template<>
+template <>
 struct Vec<uint8_t, 8> {
-    using Type = uint2;
+  using Type = uint2;
 };

-} // namespace vllm
+}  // namespace vllm
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -5,36 +5,24 @@
 #include <map>
 #include <vector>

-void swap_blocks(
-  torch::Tensor& src,
-  torch::Tensor& dst,
-  const torch::Tensor& block_mapping);
+void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
+                 const torch::Tensor& block_mapping);

-void copy_blocks(
-  std::vector<torch::Tensor>& key_caches,
-  std::vector<torch::Tensor>& value_caches,
-  const torch::Tensor& block_mapping);
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
+                 const torch::Tensor& block_mapping);

-void reshape_and_cache(
-  torch::Tensor& key,
-  torch::Tensor& value,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& slot_mapping,
-  const std::string& kv_cache_dtype,
-  const float kv_scale);
+void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
+                       torch::Tensor& key_cache, torch::Tensor& value_cache,
+                       torch::Tensor& slot_mapping,
+                       const std::string& kv_cache_dtype, const float kv_scale);

-void reshape_and_cache_flash(
-  torch::Tensor& key,
-  torch::Tensor& value,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& slot_mapping,
-  const std::string& kv_cache_dtype);
+void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
+                             torch::Tensor& key_cache,
+                             torch::Tensor& value_cache,
+                             torch::Tensor& slot_mapping,
+                             const std::string& kv_cache_dtype);

 // Just for unittest
-void convert_fp8(
-  torch::Tensor& dst_cache,
-  torch::Tensor& src_cache,
-  const float scale,
-  const std::string& kv_cache_dtype);
+void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
+                 const float scale, const std::string& kv_cache_dtype);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
--- a/csrc/cpu/activation.cpp
+++ b/csrc/cpu/activation.cpp
 #include "cpu_types.hpp"

 namespace {
-template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8 &),
+template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8&),
          bool is_gated>
-void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input,
-                       scalar_t *__restrict__ output) {
+void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input,
+                       scalar_t* __restrict__ output) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

@@ -34,13 +34,13 @@ void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input,
  }
 }

-FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) {
+FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 zeros(0.0);
  const vec_op::FP32Vec8 ones(1.0);
  return x / (ones + (zeros - x).exp());
 }

-FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) {
+FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f);
  const vec_op::FP32Vec8 w2(0.044715f);
@@ -50,7 +50,7 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) {
  return w3 * x * (ones + t);
 }

-FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) {
+FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f);
  const vec_op::FP32Vec8 w2(0.044715f);
@@ -59,14 +59,14 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) {
  return w3 * x * (ones + t);
 }

-FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) {
+FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT1_2);
  const vec_op::FP32Vec8 w2(0.5);
  return x * w2 * (ones + (x * w1).er());
 }

-FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) {
+FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5);
  const vec_op::FP32Vec8 w2(0.5);
@@ -75,40 +75,36 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) {
  const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3);
  return x * w2 * (ones + inner.tanh());
 }
-}; // namespace
+};  // namespace

-void silu_and_mul(torch::Tensor &out, torch::Tensor &input) {
+void silu_and_mul(torch::Tensor& out, torch::Tensor& input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "silu_and_mul_impl", [&] {
-        CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
-        activation_kernel<scalar_t, silu_act, true>(num_tokens, d,
-                                                    input.data_ptr<scalar_t>(),
-                                                    out.data_ptr<scalar_t>());
-        CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
-      });
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] {
+    CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
+    activation_kernel<scalar_t, silu_act, true>(
+        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
+    CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
+  });
 }

-void gelu_and_mul(torch::Tensor &out,   // [..., d]
-                      torch::Tensor &input) // [..., 2 * d]
+void gelu_and_mul(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
 {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "gelu_and_mul_impl", [&] {
-        CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
-        activation_kernel<scalar_t, gelu_act, true>(num_tokens, d,
-                                                    input.data_ptr<scalar_t>(),
-                                                    out.data_ptr<scalar_t>());
-        CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
-      });
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] {
+    CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
+    activation_kernel<scalar_t, gelu_act, true>(
+        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
+    CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
+  });
 }

-void gelu_tanh_and_mul(torch::Tensor &out,   // [..., d]
-                           torch::Tensor &input) // [..., 2 * d]
+void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
+                       torch::Tensor& input)  // [..., 2 * d]
 {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;
@@ -123,7 +119,7 @@ void gelu_tanh_and_mul(torch::Tensor &out,   // [..., d]
      });
 }

-void gelu_new(torch::Tensor &out, torch::Tensor &input) {
+void gelu_new(torch::Tensor& out, torch::Tensor& input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

@@ -135,7 +131,7 @@ void gelu_new(torch::Tensor &out, torch::Tensor &input) {
  });
 }

-void gelu_fast(torch::Tensor &out, torch::Tensor &input) {
+void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);


--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -5,25 +5,26 @@

 namespace {
 template <typename scalar_t>
-void copy_blocks_cpu_impl(
-    std::vector<torch::Tensor> &key_caches,
-    std::vector<torch::Tensor> &value_caches,
-    const torch::Tensor& mapping_pairs,
-    const int element_num_per_block, const int layer_num) {
+void copy_blocks_cpu_impl(std::vector<torch::Tensor>& key_caches,
+                          std::vector<torch::Tensor>& value_caches,
+                          const torch::Tensor& mapping_pairs,
+                          const int element_num_per_block,
+                          const int layer_num) {
  const size_t pair_num = mapping_pairs.size(0);
  const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
 #pragma omp parallel for collapse(2)
  for (int layer = 0; layer < layer_num; ++layer) {
    for (size_t pair = 0; pair < pair_num; ++pair) {
-      int64_t source_offset = element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
+      int64_t source_offset =
+          element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
      int64_t target_offset =
          element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
-      scalar_t *key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
-      scalar_t *source_ptr = key_cache_ptr + source_offset;
-      scalar_t *target_ptr = key_cache_ptr + target_offset;
+      scalar_t* key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
+      scalar_t* source_ptr = key_cache_ptr + source_offset;
+      scalar_t* target_ptr = key_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);

-      scalar_t *value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
+      scalar_t* value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
      source_ptr = value_cache_ptr + source_offset;
      target_ptr = value_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);
@@ -33,9 +34,9 @@ void copy_blocks_cpu_impl(

 template <typename scalar_t>
 void reshape_and_cache_cpu_impl(
-    const scalar_t *__restrict__ key, const scalar_t *__restrict__ value,
-    scalar_t *__restrict__ key_cache, scalar_t *__restrict__ value_cache,
-    const int64_t *__restrict__ slot_mapping, const int num_tokens,
+    const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+    scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+    const int64_t* __restrict__ slot_mapping, const int num_tokens,
    const int key_stride, const int value_stride, const int num_heads,
    const int head_size, const int block_size, const int x) {
  const int block_elem_num = num_heads * head_size * block_size;
@@ -48,14 +49,14 @@ void reshape_and_cache_cpu_impl(
        int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
        int src_value_head_idx =
            token_idx * value_stride + head_idx * head_size;
-        const scalar_t *src_key_head_ptr = key + src_key_head_idx;
-        const scalar_t *src_value_head_ptr = value + src_value_head_idx;
+        const scalar_t* src_key_head_ptr = key + src_key_head_idx;
+        const scalar_t* src_value_head_ptr = value + src_value_head_idx;
        const int64_t block_index = slot_idx / block_size;
        const int64_t block_offset = slot_idx % block_size;
-        scalar_t *target_key_head_ptr = key_cache +
+        scalar_t* target_key_head_ptr = key_cache +
                                        block_elem_num * block_index +
                                        head_idx * block_size * head_size;
-        scalar_t *target_value_head_ptr = value_cache +
+        scalar_t* target_value_head_ptr = value_cache +
                                          block_elem_num * block_index +
                                          head_idx * block_size * head_size;

@@ -79,10 +80,10 @@ void reshape_and_cache_cpu_impl(
    }
  }
 }
-}; // namespace
+};  // namespace

-void copy_blocks(std::vector<torch::Tensor> &key_caches,
-                 std::vector<torch::Tensor> &value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping) {
  unsigned num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@@ -100,10 +101,10 @@ void copy_blocks(std::vector<torch::Tensor> &key_caches,
      });
 }

-void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
-                       torch::Tensor &key_cache, torch::Tensor &value_cache,
-                       torch::Tensor &slot_mapping,
-                       const std::string &kv_cache_dtype, float kv_scale) {
+void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
+                       torch::Tensor& key_cache, torch::Tensor& value_cache,
+                       torch::Tensor& slot_mapping,
+                       const std::string& kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);

  int num_tokens = key.size(0);
@@ -127,7 +128,7 @@ void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
      });
 }

-void swap_blocks(torch::Tensor &src, torch::Tensor &dst,
-                 const torch::Tensor&block_mapping) {
+void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
+                 const torch::Tensor& block_mapping) {
  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
 }
--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
@@ -2,10 +2,10 @@

 namespace {
 template <typename scalar_t>
-void rms_norm_impl(scalar_t *__restrict__ out,
-                       const scalar_t *__restrict__ input,
-                       const scalar_t *__restrict__ weight, const float epsilon,
-                       const int num_tokens, const int hidden_size) {
+void rms_norm_impl(scalar_t* __restrict__ out,
+                   const scalar_t* __restrict__ input,
+                   const scalar_t* __restrict__ weight, const float epsilon,
+                   const int num_tokens, const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
@@ -41,11 +41,11 @@ void rms_norm_impl(scalar_t *__restrict__ out,
 }

 template <typename scalar_t>
-void fused_add_rms_norm_impl(scalar_t *__restrict__ input,
-                                 scalar_t *__restrict__ residual,
-                                 const scalar_t *__restrict__ weight,
-                                 const float epsilon, const int num_tokens,
-                                 const int hidden_size) {
+void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
+                             scalar_t* __restrict__ residual,
+                             const scalar_t* __restrict__ weight,
+                             const float epsilon, const int num_tokens,
+                             const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
@@ -85,24 +85,24 @@ void fused_add_rms_norm_impl(scalar_t *__restrict__ input,
    }
  }
 }
-} // namespace
+}  // namespace

-void rms_norm(torch::Tensor &out, torch::Tensor &input,
-                  torch::Tensor &weight, float epsilon) {
+void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
+              float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] {
    CPU_KERNEL_GUARD_IN(rms_norm_impl)
    rms_norm_impl(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
-                      weight.data_ptr<scalar_t>(), epsilon, num_tokens,
-                      hidden_size);
+                  weight.data_ptr<scalar_t>(), epsilon, num_tokens,
+                  hidden_size);
    CPU_KERNEL_GUARD_OUT(rms_norm_impl)
  });
 }

-void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual,
-                            torch::Tensor &weight, float epsilon) {
+void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
+                        torch::Tensor& weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;


--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -4,16 +4,16 @@
 namespace {
 template <typename scalar_t>
 void rotary_embedding_impl(
-    const int64_t
-        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
-    scalar_t
-        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
-                             /// [num_tokens, num_heads, head_size]
-    scalar_t
-        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
-                           // [num_tokens, num_kv_heads, head_size]
-    const scalar_t
-        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
+    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
+                                            // [num_tokens]
+    scalar_t* __restrict__ query,           /// [batch_size, seq_len, num_heads,
+                                   /// head_size] or [num_tokens, num_heads,
+                                   /// head_size]
+    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
+                                 // head_size] or [num_tokens, num_kv_heads,
+                                 // head_size]
+    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
+                                                 // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
@@ -26,7 +26,7 @@ void rotary_embedding_impl(
 #pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    int64_t pos = positions[token_idx];
-    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
+    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;

    for (int i = 0; i < num_heads; ++i) {
      const int head_idx = i;
@@ -94,16 +94,16 @@ void rotary_embedding_impl(

 template <typename scalar_t>
 void rotary_embedding_gptj_impl(
-    const int64_t
-        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
-    scalar_t
-        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
-                             /// [num_tokens, num_heads, head_size]
-    scalar_t
-        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
-                           // [num_tokens, num_kv_heads, head_size]
-    const scalar_t
-        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
+    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
+                                            // [num_tokens]
+    scalar_t* __restrict__ query,           /// [batch_size, seq_len, num_heads,
+                                   /// head_size] or [num_tokens, num_heads,
+                                   /// head_size]
+    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
+                                 // head_size] or [num_tokens, num_kv_heads,
+                                 // head_size]
+    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
+                                                 // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
@@ -113,13 +113,13 @@ void rotary_embedding_gptj_impl(
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_heads; ++i) {
      int64_t pos = positions[token_idx];
-      const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
-      const scalar_t *cos_cache_ptr = cache_ptr;
-      const scalar_t *sin_cache_ptr = cache_ptr + embed_dim;
+      const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
+      const scalar_t* cos_cache_ptr = cache_ptr;
+      const scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
-      scalar_t *head_query = token_head + query;
+      scalar_t* head_query = token_head + query;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
@@ -141,12 +141,12 @@ void rotary_embedding_gptj_impl(
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_kv_heads; ++i) {
      int64_t pos = positions[token_idx];
-      const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
-      const scalar_t *cos_cache_ptr = cache_ptr;
-      const scalar_t *sin_cache_ptr = cache_ptr + embed_dim;
+      const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
+      const scalar_t* cos_cache_ptr = cache_ptr;
+      const scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-      scalar_t *head_key = key + token_head;
+      scalar_t* head_key = key + token_head;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
@@ -164,11 +164,11 @@ void rotary_embedding_gptj_impl(
    }
  }
 }
-}; // namespace
+};  // namespace

-void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
-                          torch::Tensor &key, int head_size,
-                          torch::Tensor &cos_sin_cache, bool is_neox) {
+void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
+                      torch::Tensor& key, int head_size,
+                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;

--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
@@ -8,66 +8,37 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

  // Attention ops
-  ops.def(
-    "paged_attention_v1",
-    &paged_attention_v1,
-    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
-  ops.def(
-    "paged_attention_v2",
-    &paged_attention_v2,
-    "PagedAttention V2.");
+  ops.def("paged_attention_v1", &paged_attention_v1,
+          "Compute the attention between an input query and the cached "
+          "keys/values using PagedAttention.");
+  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");

  // Activation ops
-  ops.def(
-    "silu_and_mul",
-    &silu_and_mul,
-    "Activation function used in SwiGLU.");
-  ops.def(
-    "gelu_and_mul",
-    &gelu_and_mul,
-    "Activation function used in GeGLU with `none` approximation.");
-  ops.def(
-    "gelu_tanh_and_mul",
-    &gelu_tanh_and_mul,
-    "Activation function used in GeGLU with `tanh` approximation.");
-  ops.def(
-    "gelu_new",
-    &gelu_new,
-    "GELU implementation used in GPT-2.");
-  ops.def(
-    "gelu_fast",
-    &gelu_fast,
-    "Approximate GELU implementation.");
+  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
+  ops.def("gelu_and_mul", &gelu_and_mul,
+          "Activation function used in GeGLU with `none` approximation.");
+  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+          "Activation function used in GeGLU with `tanh` approximation.");
+  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
+  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");

  // Layernorm
-  ops.def(
-    "rms_norm",
-    &rms_norm,
-    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+  ops.def("rms_norm", &rms_norm,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor.");

-  ops.def(
-    "fused_add_rms_norm",
-    &fused_add_rms_norm,
-    "In-place fused Add and RMS Normalization");
+  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
+          "In-place fused Add and RMS Normalization");

  // Rotary embedding
-  ops.def(
-    "rotary_embedding",
-    &rotary_embedding,
-    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+  ops.def("rotary_embedding", &rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
-  cache_ops.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  cache_ops.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  cache_ops.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
+  cache_ops.def("swap_blocks", &swap_blocks,
+                "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def("copy_blocks", &copy_blocks,
+                "Copy the cache blocks from src to dst");
+  cache_ops.def("reshape_and_cache", &reshape_and_cache,
+                "Reshape the key and value tensors and cache them");
 }
--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
 #pragma once

 #ifdef USE_ROCM
-#include <hip/hip_runtime.h>
+  #include <hip/hip_runtime.h>
 #endif

 #ifndef USE_ROCM
@@ -17,7 +17,8 @@
 #endif

 #ifndef USE_ROCM
-  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
+    __shfl_xor_sync(uint32_t(-1), var, lane_mask)
 #else
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
 #endif
@@ -29,7 +30,8 @@
 #endif

 #ifndef USE_ROCM
-  #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down_sync(uint32_t(-1), var, lane_delta)
+  #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
+    __shfl_down_sync(uint32_t(-1), var, lane_delta)
 #else
  #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
 #endif
@@ -41,4 +43,3 @@
  #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
    hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
 #endif
-
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@@ -2,9 +2,6 @@

 #include <torch/extension.h>

-int get_device_attribute(
-    int attribute,
-    int device_id);
+int get_device_attribute(int attribute, int device_id);

-int get_max_shared_memory_per_block_device_attribute(
-    int device_id);
+int get_max_shared_memory_per_block_device_attribute(int device_id);