Merge tag 'v0.6.2' into v0.6.2-dev

539aa992 · zhuwenwen · 93872128 · 7193774b · 539aa992 · 539aa992
Commit 539aa992 authored Sep 27, 2024 by zhuwenwen
20 changed files
--- a/csrc/quantization/machete/machete_mm_launcher.cuh
+++ b/csrc/quantization/machete/machete_mm_launcher.cuh
@@ -71,7 +71,7 @@ torch::Tensor run_impl(PyTorchArguments args) {
  auto arguments = MacheteKernel::create_arguments(
      stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr,
      layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0),
-      args.group_size.value_or(K));
+      args.group_size);
  TORCH_CHECK(MacheteKernel::can_implement(arguments),
              "Machete kernel cannot be run with these arguments");

--- a/csrc/quantization/machete/machete_prepack_launcher.cuh
+++ b/csrc/quantization/machete/machete_prepack_launcher.cuh
@@ -53,7 +53,7 @@ torch::Tensor prepack_impl(torch::Tensor const B) {
  // clang-format on
  // Allocate output
-  torch::Tensor D = torch::empty_like(B);
+  torch::Tensor D = torch::empty_like(B, {}, at::MemoryFormat::Contiguous);
  prepack_B<PrepackedLayoutB>(stream, B_ptr, layout_Bt,
                              static_cast<ElementB*>(D.mutable_data_ptr()));

--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
+#pragma once
+#include <torch/all.h>
+void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
+                     torch::Tensor& max_logits, torch::Tensor& tmp_out,
+                     torch::Tensor& query, torch::Tensor& key_cache,
+                     torch::Tensor& value_cache, int64_t num_kv_heads,
+                     double scale, torch::Tensor& block_tables,
+                     torch::Tensor& context_lens, int64_t block_size,
+                     int64_t max_context_len,
+                     const c10::optional<torch::Tensor>& alibi_slopes,
+                     const std::string& kv_cache_dtype, double k_scale,
+                     double v_scale);
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
+#include "core/registration.h"
+#include "rocm/ops.h"
+// Note on op signatures:
+// The X_meta signatures are for the meta functions corresponding to op X.
+// They must be kept in sync with the signature for X. Generally, only
+// functions that return Tensors require a meta function.
+//
+// See the following links for detailed docs on op registration and function
+// schemas.
+// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
+  // vLLM custom ops for rocm
+  // Custom attention op
+  // Compute the attention between an input query and the cached
+  // keys/values using PagedAttention.
+  rocm_ops.def(
+      "paged_attention(Tensor! out, Tensor exp_sums,"
+      "                Tensor max_logits, Tensor tmp_out,"
+      "                Tensor query, Tensor key_cache,"
+      "                Tensor value_cache, int num_kv_heads,"
+      "                float scale, Tensor block_tables,"
+      "                Tensor context_lens, int block_size,"
+      "                int max_context_len,"
+      "                Tensor? alibi_slopes,"
+      "                str kv_cache_dtype,"
+      "                float k_scale, float v_scale) -> ()");
+  rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
+}
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::
  When the profiler is stopped, it flushes all the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, it takes about 10 minutes to flush out on an H100.
-   Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
+   Set the environment variable VLLM_RPC_TIMEOUT to a large value before you start the server, for example 30 minutes (1800000 milliseconds).
-   ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+   ``export VLLM_RPC_TIMEOUT=1800000``
 Example commands and usage:
 ===========================

--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
--- a/docs/source/getting_started/neuron-installation.rst
+++ b/docs/source/getting_started/neuron-installation.rst
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
--- a/docs/source/models/lora.rst
+++ b/docs/source/models/lora.rst
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
--- a/docs/source/quantization/bnb.rst
+++ b/docs/source/quantization/bnb.rst
--- a/docs/source/quantization/gguf.rst
+++ b/docs/source/quantization/gguf.rst
--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
--- a/examples/offline_chat_with_tools.py
+++ b/examples/offline_chat_with_tools.py