Merge tag 'v0.6.1' into v0.6.1-dev

4851c202 · zhuwenwen · 9b902f9e · 3fd2b0d2 · 4851c202 · 4851c202
Commit 4851c202 authored Sep 13, 2024 by zhuwenwen
20 changed files
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -11,8 +11,9 @@ import uvloop
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
 from vllm.inputs import PromptInputs
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -504,13 +505,11 @@ if __name__ == "__main__":
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
-    parser.add_argument(
+    parser.add_argument("--device",
-        "--device",
                        type=str,
                        default="auto",
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+                        choices=DEVICE_OPTIONS,
-        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+                        help='device type for vLLM execution')
-        'CPU.')
    parser.add_argument(
        "--num-scheduler-steps",
        type=int,

--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CXX_STANDARD 17)
 #
 # Define environment variables for special configurations
@@ -83,12 +84,7 @@ endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
-list(APPEND LIBS "numa")
+list(APPEND LIBS dnnl numa)
-#
-# Define extension targets
-#
 #
 # _C extension
@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
    "csrc/cpu/pos_encoding.cpp"
    "csrc/cpu/torch_bindings.cpp")
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/quant.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+#
+# Define extension targets
+#
 define_gpu_extension_target(
    _C
    DESTINATION vllm

--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -355,6 +355,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})
+  # TODO: is torch_python_LIBRARY needed?
  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})

--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -24,8 +24,8 @@ namespace vec_op {
 #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
 #define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
+  RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME)
 #endif
 #define FORCE_INLINE __attribute__((always_inline)) inline
@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(const FP32Vec16 &);
  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+  void save(void* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm256_mask_storeu_epi16(ptr, mask, reg);
+  }
 };
 #ifdef __AVX512F__
@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }
+  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
+    return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
+  }
+  FP32Vec16 max(const FP32Vec16& b) const {
+    return FP32Vec16(_mm512_max_ps(reg, b.reg));
+  }
+  FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
+  }
+  FP32Vec16 abs() const {
+    return FP32Vec16(_mm512_abs_ps(reg));
+  } 
  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
+  float reduce_max() const { return _mm512_reduce_max_ps(reg); }
  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  }
  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+  void save(float* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm512_mask_storeu_ps(ptr, mask, reg);
+  }
 };
 #else
 struct FP32Vec16 : public Vec<FP32Vec16> {
@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 };
 #endif
+#ifdef __AVX512F__
+struct INT8Vec16: public Vec<INT8Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m128i reg;
+    int8_t values[VEC_ELEM_NUM];
+  };
+  __m128i reg;
+  explicit INT8Vec16(const FP32Vec16& vec) : reg(
+    _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
+  ) {}
+  void save(int8_t* ptr) const {
+    _mm_storeu_epi8(ptr, reg);
+  }
+  void save(int8_t* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm_mask_storeu_epi8(ptr, mask, reg);
+  }
+};
+#endif
 template <typename T> struct VecType { using vec_type = void; };
 template <typename T> using vec_t = typename VecType<T>::vec_type;

--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
+#ifndef DNNL_HELPER_HPP
+#define DNNL_HELPER_HPP
+#include <c10/util/BFloat16.h>
+#include "oneapi/dnnl/dnnl.hpp"
+namespace {
+template <typename T>
+struct DNNLType {
+  static constexpr dnnl::memory::data_type type =
+      dnnl::memory::data_type::undef;
+};
+template <>
+struct DNNLType<int8_t> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
+};
+template <>
+struct DNNLType<int32_t> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
+};
+template <>
+struct DNNLType<float> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
+};
+template <>
+struct DNNLType<c10::BFloat16> {
+  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
+};
+template <typename T>
+constexpr inline dnnl::memory::data_type get_dnnl_type() {
+  return DNNLType<std::decay_t<T>>::type;
+}
+};  // namespace
+template <bool InputNoScale>
+class DNNLPrimitiveHelper {
+ public:
+  // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
+  // A: [M, K], row-major
+  // B: [K, N], column-major
+  // C: [M, N], row-major
+  // bias: [N], row-major, optional
+  // a_scales: [MS]
+  // b_scales: [NS]
+  // Note: Due to the limitation of oneDNN
+  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
+  // not supported.
+  template <typename OutputT, typename BiasT>
+  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
+                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
+                            dnnl_dim_t K, const float* a_scales,
+                            const float* b_scales, dnnl_dim_t MS,
+                            dnnl_dim_t NS) {
+    auto&& OutputType = get_dnnl_type<OutputT>();
+    auto&& BiasType = get_dnnl_type<BiasT>();
+    dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
+    dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
+    dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
+    dnnl::primitive_attr attr;
+    if constexpr (!InputNoScale) {
+      if (MS == 1) {
+        // per-tensor
+        attr.set_scales_mask(DNNL_ARG_SRC, 0);
+      } else {
+        // per-token
+        TORCH_CHECK(false, "per-token quantization is unsupported.");
+      }
+    }
+    if (NS == 1) {
+      // per-tensor
+      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
+    } else {
+      // per-channel
+      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
+    }
+    dnnl::matmul::primitive_desc matmul_pd;
+    if (bias) {
+      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
+                                               bias_md, c_md, attr);
+    } else {
+      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
+                                               c_md, attr);
+    }
+    dnnl::matmul matmul(matmul_pd);
+    auto& engine = default_engine();
+    dnnl::memory a_m(a_md, engine, (void*)a);
+    dnnl::memory b_m(b_md, engine, (void*)b);
+    dnnl::memory c_m(c_md, engine, (void*)c);
+    dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
+                            (void*)a_scales);
+    dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
+                            (void*)b_scales);
+    auto& stream = default_stream();
+    if constexpr (InputNoScale) {
+      if (bias) {
+        dnnl::memory::desc bias_md({N}, BiasType, {1});
+        dnnl::memory bias_m(bias_md, engine, (void*)bias);
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_BIAS, bias_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      } else {
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      }
+    } else {
+      if (bias) {
+        dnnl::memory::desc bias_md({N}, BiasType, {1});
+        dnnl::memory bias_m(bias_md, engine, (void*)bias);
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_BIAS, bias_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      } else {
+        matmul.execute(
+            stream, {
+                        {DNNL_ARG_SRC, a_m},
+                        {DNNL_ARG_WEIGHTS, b_m},
+                        {DNNL_ARG_DST, c_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
+                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
+                    });
+      }
+    }
+    stream.wait();
+  }
+ private:
+  static dnnl::engine& default_engine() {
+    static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
+    return engine;
+  }
+  static dnnl::stream& default_stream() {
+    static dnnl::stream stream(default_engine());
+    return stream;
+  }
+};
+#endif
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
+#include "cpu_types.hpp"
+#include "dnnl_helper.hpp"
+namespace {
+template <typename scalar_t>
+struct KernelVecType {
+  using load_vec_type = void;
+  using cvt_vec_type = void;
+};
+template <>
+struct KernelVecType<float> {
+  using load_vec_type = vec_op::FP32Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using load_vec_type = vec_op::BF16Vec16;
+  using cvt_vec_type = vec_op::FP32Vec16;
+};
+#ifdef __AVX512F__
+template <typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int num_tokens,
+                                   const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+  constexpr float i8_min =
+      static_cast<float>(std::numeric_limits<int8_t>::min());
+  constexpr float i8_max =
+      static_cast<float>(std::numeric_limits<int8_t>::max());
+  const cvt_vec_t inv_scale(1.0 / *scale);
+  const cvt_vec_t i8_min_vec(i8_min);
+  const cvt_vec_t i8_max_vec(i8_max);
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      elems_int8.save(output + i * hidden_size + j);
+    }
+    load_vec_t elems(input + i * hidden_size + j);
+    cvt_vec_t elems_fp32(elems);
+    elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
+    vec_op::INT8Vec16 elems_int8(elems_fp32);
+    if (j + vec_elem_num == hidden_size) {
+      elems_int8.save(output + i * hidden_size + j);
+    } else {
+      elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+    }
+  }
+}
+template <typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, const int num_tokens,
+                                    const int hidden_size) {
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    cvt_vec_t max_abs(0.0);
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        max_abs = max_abs.max(elems_fp32.abs());
+      }
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      if (j + vec_elem_num == hidden_size) {
+        max_abs = max_abs.max(elems_fp32.abs());
+      } else {
+        max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
+      }
+    }
+    float scale_val = max_abs.reduce_max() / 127.0f;
+    scale[i] = scale_val;
+    const cvt_vec_t inv_scale(1.0 / scale_val);
+    {
+      int j = 0;
+      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+        load_vec_t elems(input + i * hidden_size + j);
+        cvt_vec_t elems_fp32(elems);
+        elems_fp32 = (elems_fp32 * inv_scale);
+        vec_op::INT8Vec16 elems_int8(elems_fp32);
+        elems_int8.save(output + i * hidden_size + j);
+      }
+      load_vec_t elems(input + i * hidden_size + j);
+      cvt_vec_t elems_fp32(elems);
+      elems_fp32 = (elems_fp32 * inv_scale);
+      vec_op::INT8Vec16 elems_int8(elems_fp32);
+      if (j + vec_elem_num == hidden_size) {
+        elems_int8.save(output + i * hidden_size + j);
+      } else {
+        elems_int8.save(output + i * hidden_size + j, hidden_size - j);
+      }
+    }
+  }
+}
+template <bool Bias, typename scalar_t>
+void dynamic_output_scale_impl(const float* input, scalar_t* output,
+                               const float* scale, const scalar_t* bias,
+                               const int num_tokens, const int hidden_size) {
+  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
+  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
+  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
+  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+  #pragma omp parallel for
+  for (int i = 0; i < num_tokens; ++i) {
+    int j = 0;
+    cvt_vec_t token_scale_vec(scale[i]);
+    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+      cvt_vec_t elems_fp32(input + i * hidden_size + j);
+      elems_fp32 = elems_fp32 * token_scale_vec;
+      if constexpr (Bias) {
+        load_vec_t bias_vec(bias + j);
+        cvt_vec_t bias_vec_fp32(bias_vec);
+        elems_fp32 = elems_fp32 + bias_vec_fp32;
+      }
+      load_vec_t elems_out(elems_fp32);
+      elems_out.save(output + i * hidden_size + j);
+    }
+    cvt_vec_t elems_fp32(input + i * hidden_size + j);
+    elems_fp32 = elems_fp32 * token_scale_vec;
+    if constexpr (Bias) {
+      load_vec_t bias_vec(bias + j);
+      cvt_vec_t bias_vec_fp32(bias_vec);
+      elems_fp32 = elems_fp32 + bias_vec_fp32;
+    }
+    load_vec_t elems_out(elems_fp32);
+    if (j + vec_elem_num == hidden_size) {
+      elems_out.save(output + i * hidden_size + j);
+    } else {
+      elems_out.save(output + i * hidden_size + j, hidden_size - j);
+    }
+  }
+}
+#else
+template <typename scalar_t>
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                   const float* scale, const int num_tokens,
+                                   const int hidden_size) {
+  TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
+}
+template <typename scalar_t>
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+                                    float* scale, const int num_tokens,
+                                    const int hidden_size) {
+  TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
+}
+template <typename scalar_t>
+void dynamic_output_scale_impl() {
+  TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
+}
+#endif
+}  // namespace
+void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
+                    const torch::Tensor& a,         // [M, IC], row-major
+                    const torch::Tensor& b,         // [IC, OC], column-major
+                    const torch::Tensor& a_scales,  // [1] or [M]
+                    const torch::Tensor& b_scales,  // [1] or [OC]
+                    const c10::optional<torch::Tensor>& bias  // [OC]
+) {
+  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
+  // Checks for conformality
+  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
+              "int8_scaled_mm only supports INT8 inputs.")
+  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
+  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
+              b.size(1) == c.size(1));
+  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
+  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
+  // Check for strides and alignment
+  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
+  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
+  TORCH_CHECK(c.stride(0) % 16 == 0 &&
+              b.stride(1) % 16 == 0);  // 16 Byte Alignment
+  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
+  if (bias) {
+    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
+                bias->dim() == 1);
+  }
+  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
+    if (a_scales.numel() != 1) {
+      // per-token
+      // Note: oneDNN doesn't support per-token activation quantization
+      torch::Tensor tmp_fp32_out =
+          torch::empty_like(c, ::at::ScalarType::Float);
+      DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
+          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
+          tmp_fp32_out.data_ptr<float>(), (void*)(0), a.size(0), b.size(1),
+          a.size(1), (float*)(0), b_scales.data_ptr<float>(), 0,
+          b_scales.numel());
+      if (bias.has_value()) {
+        dynamic_output_scale_impl<true>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
+            c.size(1));
+      } else {
+        dynamic_output_scale_impl<false>(
+            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
+            a_scales.data_ptr<float>(), (scalar_t*)(0), c.size(0), c.size(1));
+      }
+    } else {
+      // per-tensor
+      if (bias.has_value()) {
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
+            bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
+            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      } else {
+        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
+            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
+            (void*)(0), a.size(0), b.size(1), a.size(1),
+            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
+            a_scales.numel(), b_scales.numel());
+      }
+    }
+  });
+}
+// static-per-tensor quantization.
+void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
+                              const torch::Tensor& input,  // [..., hidden_size]
+                              const torch::Tensor& scale) {
+  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(scale.numel() == 1);
+  const int hidden_size = input.size(-1);
+  const int num_tokens = input.numel() / hidden_size;
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
+        static_scaled_int8_quant_impl(
+            input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+            scale.data_ptr<float>(), num_tokens, hidden_size);
+      });
+}
+// dynamic-per-token quantization.
+void dynamic_scaled_int8_quant(
+    torch::Tensor& out,          // [..., hidden_size]
+    const torch::Tensor& input,  // [..., hidden_size]
+    torch::Tensor& scale         // [..., 1]
+) {
+  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+  int const hidden_size = input.size(-1);
+  int const num_tokens = input.numel() / hidden_size;
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
+        dynamic_scaled_int8_quant_impl(
+            input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+            scale.data_ptr<float>(), num_tokens, hidden_size);
+      });
+}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -4,7 +4,12 @@
 #include <torch/library.h>
-void init_cpu_threads_env(const std::string& cpu_ids);
+std::string init_cpu_threads_env(const std::string& cpu_ids);
+void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+                    const torch::Tensor& b, const torch::Tensor& a_scales,
+                    const torch::Tensor& b_scales,
+                    const c10::optional<torch::Tensor>& bias);
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops
@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
      "    Tensor value_cache, int num_kv_heads, float scale,"
      "    Tensor block_tables, Tensor seq_lens, int block_size,"
      "    int max_seq_len, Tensor? alibi_slopes,"
@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                 Tensor! key, int head_size,"
      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
+  // Quantization
+#ifdef __AVX512F__
+  // Compute int8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
+      "()");
+  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+  // Compute int8 quantized tensor and scaling factor
+  ops.def(
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
+      "()");
+  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
+           &dynamic_scaled_int8_quant);
+  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      "                  Tensor b, Tensor a_scales,"
+      "                  Tensor b_scales, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
+#endif
 }
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Copy the cache blocks from src to dst.
  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "block_mapping) -> ()");
+      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
  // Reshape the key and value tensors and cache them.
@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
  // CPU utils
-  utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
+  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -5,7 +5,7 @@
 #include "cpu_types.hpp"
-void init_cpu_threads_env(const std::string& cpu_ids) {
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
  torch::set_num_threads((int)omp_cpu_ids.size());
  TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
  TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
+  std::vector<std::pair<int, int>> thread_core_mapping;
+  thread_core_mapping.reserve(omp_cpu_ids.size());
+  omp_lock_t writelock;
+  omp_init_lock(&writelock);
 #pragma omp parallel for schedule(static, 1)
  for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
-    cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
+    cpu_set_t mask;
-    size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
+    CPU_ZERO(&mask);
-    CPU_ZERO_S(size, mask);
+    CPU_SET(omp_cpu_ids[i], &mask);
-    CPU_SET_S(omp_cpu_ids[i], size, mask);
+    int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
-    sched_setaffinity(0, sizeof(cpu_set_t), mask);
+    if (ret == -1) {
-    CPU_FREE(mask);
+      TORCH_CHECK(false,
+                  "sched_setaffinity failed. errno: " + std::to_string(errno));
+    }
+    omp_set_lock(&writelock);
+    thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
+    omp_unset_lock(&writelock);
  }
+  omp_destroy_lock(&writelock);
  numa_free_nodemask(omp_cpu_mask);
+  std::stringstream ss;
+  ss << "OMP threads binding of Process " << getpid() << ":\n";
+  std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
+            [](auto&& a, auto&& b) { return a.second < b.second; });
+  for (auto&& item : thread_core_mapping) {
+    ss << "\t"
+       << "OMP tid: " << item.first << ", core " << item.second << "\n";
+  }
+  return ss.str();
 }
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
      "bool replicate_input, bool apply_weights) -> Tensor");
  m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 #endif
 }

--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -165,9 +165,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);
+torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits);
 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                int64_t size_n, int64_t num_bits);
+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits);
 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                              int64_t n);
@@ -212,9 +220,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor& scales);
-void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
-                     torch::Tensor lookup_table);
 // torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
 //                         torch::Tensor b_gptq_qzeros,
 //                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,

--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
 }
 #endif
+torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                     c10::SymInt size_k, c10::SymInt size_n,
+                                     int64_t num_bits) {
+  int const pack_factor = 32 / num_bits;
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+}
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 }
 #endif
+torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
+                                      torch::Tensor& perm, c10::SymInt size_k,
+                                      c10::SymInt size_n, int64_t num_bits) {
+  int const pack_factor = 32 / num_bits;
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  return torch::empty_symint(
+      {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
+      options);
+}
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
-#include <torch/all.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-// half-tensor
-#include <c10/cuda/CUDAStream.h>
-#include <ATen/cuda/CUDATensorMethods.cuh>
-#include <c10/cuda/CUDAGuard.h>
-#define BLOCKWIDTH 128
-#define BLOCKHEIGHT4 16
-namespace vllm {
-namespace squeezellm {
-__device__ inline unsigned int as_unsigned(int i) {
-  return *reinterpret_cast<unsigned int*>(&i);
-}
-// 4-bit matvec kernel (LUT-based)
-__global__ void NUQ4MatMulKernel(
-#ifndef USE_ROCM
-    const half2* __restrict__ vec,
-#else
-    const __half2* __restrict__ vec,
-#endif
-    const int* __restrict__ mat,
-#ifndef USE_ROCM
-    half2* __restrict__ mul,
-#else
-    float2* __restrict__ mul,
-#endif
-    const __half* __restrict__ lookup_table, int height, int width, int batch,
-    int vec_height) {
-  const int blockwidth2 = BLOCKWIDTH / 2;
-  int row = BLOCKHEIGHT4 * blockIdx.x;
-  int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
-#ifndef USE_ROCM
-  __shared__ half2 blockvec[blockwidth2];
-#else
-  __shared__ __half2 blockvec[blockwidth2];
-#endif
-  __shared__ __half deq2[16][BLOCKWIDTH];
-  int off = threadIdx.x;
-  int column_offset = col * 16;
-  for (int val = 0; val < 16; val += 1) {
-    int lut_index = column_offset + val;
-    deq2[val][off] = lookup_table[lut_index];
-  }
-  __half res;
-#ifndef USE_ROCM
-  half2 res2;
-  half2 tmp2;
-#else
-  __half2 res2;
-  __half2 tmp2;
-#endif
-  int i;
-  int k;
-  unsigned int tmp1;
-  unsigned int lut_index1, lut_index2;
-  for (int b = 0; b < batch; ++b) {
-    i = width * row + col;
-    res = __int2half_rd(0);
-    k = 0;
-    __syncthreads();
-    if (threadIdx.x < blockwidth2)
-      blockvec[threadIdx.x] =
-          vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 +
-              threadIdx.x];
-    __syncthreads();
-    while (k < blockwidth2) {
-      tmp1 = as_unsigned(mat[i]);
-#ifndef USE_ROCM
-      res2 = {};
-      tmp2 = {};
-#else
-      res2.x = __half_as_ushort(__float2half(0));
-      res2.y = __half_as_ushort(__float2half(0));
-      tmp2.x = __half_as_ushort(__float2half(0));
-      tmp2.y = __half_as_ushort(__float2half(0));
-#endif
-      lut_index1 = tmp1 & 0xF;
-      lut_index2 = (tmp1 >> 4) & 0xF;
-#ifndef USE_ROCM
-      tmp2.x = deq2[lut_index1][off];
-      tmp2.y = deq2[lut_index2][off];
-#else
-      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
-      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
-#endif
-      res2 = __hfma2(tmp2, blockvec[k + 0], res2);
-      lut_index1 = (tmp1 >> 8) & 0xF;
-      lut_index2 = (tmp1 >> 12) & 0xF;
-#ifndef USE_ROCM
-      tmp2.x = deq2[lut_index1][off];
-      tmp2.y = deq2[lut_index2][off];
-#else
-      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
-      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
-#endif
-      res2 = __hfma2(tmp2, blockvec[k + 1], res2);
-      lut_index1 = (tmp1 >> 16) & 0xF;
-      lut_index2 = (tmp1 >> 20) & 0xF;
-#ifndef USE_ROCM
-      tmp2.x = deq2[lut_index1][off];
-      tmp2.y = deq2[lut_index2][off];
-#else
-      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
-      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
-#endif
-      res2 = __hfma2(tmp2, blockvec[k + 2], res2);
-      lut_index1 = (tmp1 >> 24) & 0xF;
-      lut_index2 = (tmp1 >> 28) & 0xF;
-#ifndef USE_ROCM
-      tmp2.x = deq2[lut_index1][off];
-      tmp2.y = deq2[lut_index2][off];
-#else
-      tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
-      tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
-#endif
-      res2 = __hfma2(tmp2, blockvec[k + 3], res2);
-#ifndef USE_ROCM
-      res = __hadd(__hadd(res2.x, res2.y), res);
-#else
-      res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)),
-                   res);
-#endif
-      i += width;
-      k += 4;
-    }
-    // col%2 -> only set one of the two values
-#ifndef USE_ROCM
-    half2 res3 = {};
-    if (col % 2 == 0) {
-      res3.x = res;
-    } else {
-      res3.y = res;
-    }
-#else
-    __half2 res3;
-    res3.x = __half_as_ushort(__float2half(0));
-    res3.y = __half_as_ushort(__float2half(0));
-    if (col % 2 == 0) {
-      res3.x = __half_as_ushort(res);
-    } else {
-      res3.y = __half_as_ushort(res);
-    }
-#endif
-#ifndef USE_ROCM
-    atomicAdd(&mul[b * width / 2 + col / 2], res3);
-#else
-    int tmp_addr = b * width / 2 + col / 2;
-    atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
-    atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
-#endif
-  }
-}
-}  // namespace squeezellm
-}  // namespace vllm
-// 4-bit matvec kernel (LUT-based)
-void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
-                     torch::Tensor lookup_table) {
-  int height = mat.size(0);
-  int width = mat.size(1);
-  int batch = vec.size(0);
-  int vec_height = vec.size(1);
-  dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
-              (width + BLOCKWIDTH - 1) / BLOCKWIDTH);
-  dim3 threads(BLOCKWIDTH);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
-#ifndef USE_ROCM
-      (half2*)vec.data_ptr<at::Half>(),
-#else
-      (__half2*)vec.data_ptr<at::Half>(),
-#endif
-      mat.data_ptr<int>(),
-#ifndef USE_ROCM
-      (half2*)mul.data_ptr<at::Half>(),
-      (__half*)lookup_table.data_ptr<at::Half>(),
-#else
-      (float2*)mul.data_ptr<float>(),
-      (__half*)lookup_table.data_ptr<at::Half>(),
-#endif
-      height, width, batch, vec_height);
-}
-#undef BLOCKWIDTH
-#undef BLOCKHEIGHT4
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
      "    Tensor value_cache, int num_kv_heads, float scale,"
      "    Tensor block_tables, Tensor seq_lens, int block_size,"
      "    int max_seq_len, Tensor? alibi_slopes,"
@@ -113,7 +113,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
  // prepare_inputs advance_step
-  ops.def("advance_step", &advance_step);
+  ops.def(
+      "advance_step(int num_seqs, int num_queries, int block_size, "
+      "Tensor! input_tokens, Tensor sampled_token_ids, "
+      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
+      "Tensor block_tables) -> ()");
  ops.impl("advance_step", torch::kCUDA, &advance_step);
  // Layernorm
@@ -175,27 +179,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization ops
 #ifndef USE_ROCM
  // Quantized GEMM for AQLM.
-  ops.def("aqlm_gemm", &aqlm_gemm);
+  ops.def(
+      "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
+      "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
+      "-> Tensor");
  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
  // Decompression method for AQLM.
-  ops.def("aqlm_dequant", &aqlm_dequant);
+  ops.def(
+      "aqlm_dequant(Tensor codes, Tensor codebooks, "
+      "int[] codebook_partition_sizes) -> Tensor");
  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
  // Quantized GEMM for AWQ.
-  ops.def("awq_gemm", &awq_gemm);
+  ops.def(
+      "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
+      "Tensor _zeros, int split_k_iters) -> Tensor");
  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
  // Dequantization for AWQ.
-  ops.def("awq_dequantize", &awq_dequantize);
+  ops.def(
+      "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
+      "Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
+  // Note about marlin kernel 'workspace' arguments:
+  // Technically these should be mutable since they are modified by the kernel.
+  // But since they are set back to zero once the kernel is finished we can
+  // hand wave and say that they have no net effect.
+  //
+  // The reason to mark 'workspace' as immutable is so that they don't interfere
+  // with using ScalarType arguments in the ops. If they are marked as mutable,
+  // pytorch throws an assert in
+  // 'torch._higher_order_ops._register_effectful_op' that prevents these
+  // kernels from being torch.compile'd.
+  // See the following document for more info on custom types and ops that use
+  // custom types:
+  // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
  // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
-  ops.def("marlin_gemm", &marlin_gemm);
+  ops.def(
+      "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
  ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
+  ops.def(
+      "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
+      "Tensor b_scales, Tensor workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int size_m, int size_n, int size_k) -> Tensor");
  ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
  // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
@@ -214,35 +247,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
  // gptq_marlin Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
+  ops.def(
+      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, "
+      "int size_m, int size_n, int size_k, bool is_k_full, "
+      "bool has_zp, bool use_fp32_reduce) -> Tensor");
  ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
  // gptq_marlin repack from GPTQ.
-  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
+  ops.def(
+      "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
+      "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+  ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
  // awq_marlin repack from AWQ.
-  ops.def("awq_marlin_repack", &awq_marlin_repack);
+  ops.def(
+      "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
+      "SymInt size_n, int num_bits) -> Tensor");
  ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
+  ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
  // Dequantization for GGML.
-  ops.def("ggml_dequantize", &ggml_dequantize);
+  ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
  // mmvq kernel for GGML.
-  ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
+  ops.def(
+      "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
+      "-> Tensor");
  ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
  // mmq kernel for GGML.
-  ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
+  ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
-  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
+  ops.def(
+      "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
+      "Tensor! workspace, int num_bits, int size_m, int size_n, "
+      "int size_k) -> Tensor");
  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
  // marlin_qqq_gemm for QQQ.
-  ops.def("marlin_qqq_gemm", &marlin_qqq_gemm);
+  ops.def(
+      "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
+      "Tensor s_tok, Tensor s_ch, Tensor s_group, "
+      "Tensor! workspace, int size_m, int size_n, "
+      "int size_k) -> Tensor");
  ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
@@ -264,16 +317,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Check if cutlass scaled_mm is supported for CUDA devices of the given
  // capability
-  ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
+  ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
-  ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
+  ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
-           &cutlass_scaled_mm_supports_fp8);
  // Mamba selective scan kernel
  ops.def(
      "selective_scan_fwd(Tensor! u, Tensor! delta,"
      "Tensor! A, Tensor! B, Tensor! C,"
      "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
      "bool delta_softplus,"
-      "Tensor? index_, Tensor? x) -> Tensor[]");
+      "Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
  ops.def(
@@ -295,19 +348,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif
  // Quantized GEMM for GPTQ.
-//   ops.def("gptq_gemm", &gptq_gemm);
+  // Note: even though the C++ inferred schema is correct for this op, it seems
+  // to prevent the meta function registry.
+//   ops.def(
+//       "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
+//       "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
+//       "-> Tensor");
 //   ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
  // Post processing for GPTQ.
 //   ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
 //   ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
-  // Quantized GEMM for SqueezeLLM.
-  ops.def(
-      "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
-      "lookup_table) -> ()");
-  ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
  // Compute FP8 quantized tensor for given scaling factor.
 //   ops.def(
 //       "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
@@ -322,8 +374,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
 //   ops.def(
-//       "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
+//       "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
-//       "scale, Tensor? scale_ub) -> "
+//       "Tensor! scale, Tensor? scale_ub) -> "
 //       "()");
 //   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
 //            &dynamic_per_token_scaled_fp8_quant);
@@ -360,8 +412,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Copy the cache blocks from src to dst.
  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "block_mapping) -> ()");
+      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
  // Reshape the key and value tensors and cache them.
@@ -386,8 +438,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Convert the key and value cache to fp8 data type.
  cache_ops.def(
-      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
+      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
-      "kv_cache_dtype) -> ()");
+      "str kv_cache_dtype) -> ()");
  cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
 }
@@ -395,24 +447,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
  // Cuda utils
  // Gets the specified device attribute.
-  cuda_utils.def("get_device_attribute", &get_device_attribute);
+  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
-  cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
+  cuda_utils.impl("get_device_attribute", &get_device_attribute);
  // Gets the maximum shared memory per block device attribute.
-  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
+  cuda_utils.def(
-                 &get_max_shared_memory_per_block_device_attribute);
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
-                  torch::kCUDA,
                  &get_max_shared_memory_per_block_device_attribute);
 }
 #ifndef USE_ROCM
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
-  custom_ar.def("init_custom_ar", &init_custom_ar);
+  custom_ar.def(
+      "init_custom_ar(Tensor meta, Tensor rank_data, "
+      "str[] handles, int[] offsets, int rank, "
+      "bool full_nvlink) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
-  custom_ar.def("should_custom_ar", &should_custom_ar);
+  custom_ar.def(
+      "should_custom_ar(Tensor inp, int max_size, int world_size, "
+      "bool full_nvlink) -> bool");
  custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
@@ -424,21 +480,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
  custom_ar.def("dispose", &dispose);
-  custom_ar.impl("dispose", torch::kCPU, &dispose);
  custom_ar.def("meta_size", &meta_size);
-  custom_ar.impl("meta_size", torch::kCPU, &meta_size);
-  custom_ar.def("register_buffer", &register_buffer);
+  custom_ar.def(
+      "register_buffer(int fa, Tensor t, str[] handles, "
+      "int[] offsets) -> ()");
  custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
-  custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
-                 &get_graph_buffer_ipc_meta);
  custom_ar.def("register_graph_buffers", &register_graph_buffers);
-  custom_ar.impl("register_graph_buffers", torch::kCPU,
-                 &register_graph_buffers);
 }
 #endif

--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -11,6 +11,5 @@ pydantic >= 2.8
 torch
 py-cpuinfo
 transformers
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 mistral_common >= 1.3.4
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
--- a/docs/source/community/meetups.rst
+++ b/docs/source/community/meetups.rst
@@ -5,6 +5,7 @@ vLLM Meetups
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
 - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
 - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
 - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -99,6 +99,7 @@ autodoc_mock_imports = [
    "aiohttp",
    "compressed_tensors",
    "cpuinfo",
+    "cv2",
    "torch",
    "transformers",
    "psutil",

--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@@ -18,13 +18,27 @@ Traces can be visualized using https://ui.perfetto.dev/.
   Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-Example commands:
+.. tip::
+   To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
+   Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
+   ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+Example commands and usage:
+===========================
+Offline Inference:
+------------------
+Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
 OpenAI Server:
+--------------
 .. code-block:: bash
-    VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 
+    VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B 
 benchmark_serving.py: