Commit 539aa992 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.2' into v0.6.2-dev

parents 93872128 7193774b
...@@ -71,7 +71,7 @@ torch::Tensor run_impl(PyTorchArguments args) { ...@@ -71,7 +71,7 @@ torch::Tensor run_impl(PyTorchArguments args) {
auto arguments = MacheteKernel::create_arguments( auto arguments = MacheteKernel::create_arguments(
stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr, stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr,
layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0), layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0),
args.group_size.value_or(K)); args.group_size);
TORCH_CHECK(MacheteKernel::can_implement(arguments), TORCH_CHECK(MacheteKernel::can_implement(arguments),
"Machete kernel cannot be run with these arguments"); "Machete kernel cannot be run with these arguments");
......
...@@ -53,7 +53,7 @@ torch::Tensor prepack_impl(torch::Tensor const B) { ...@@ -53,7 +53,7 @@ torch::Tensor prepack_impl(torch::Tensor const B) {
// clang-format on // clang-format on
// Allocate output // Allocate output
torch::Tensor D = torch::empty_like(B); torch::Tensor D = torch::empty_like(B, {}, at::MemoryFormat::Contiguous);
prepack_B<PrepackedLayoutB>(stream, B_ptr, layout_Bt, prepack_B<PrepackedLayoutB>(stream, B_ptr, layout_Bt,
static_cast<ElementB*>(D.mutable_data_ptr())); static_cast<ElementB*>(D.mutable_data_ptr()));
......
This diff is collapsed.
#pragma once
#include <torch/all.h>
void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
torch::Tensor& max_logits, torch::Tensor& tmp_out,
torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads,
double scale, torch::Tensor& block_tables,
torch::Tensor& context_lens, int64_t block_size,
int64_t max_context_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale,
double v_scale);
#include "core/registration.h"
#include "rocm/ops.h"
// Note on op signatures:
// The X_meta signatures are for the meta functions corresponding to op X.
// They must be kept in sync with the signature for X. Generally, only
// functions that return Tensors require a meta function.
//
// See the following links for detailed docs on op registration and function
// schemas.
// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
// vLLM custom ops for rocm
// Custom attention op
// Compute the attention between an input query and the cached
// keys/values using PagedAttention.
rocm_ops.def(
"paged_attention(Tensor! out, Tensor exp_sums,"
" Tensor max_logits, Tensor tmp_out,"
" Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads,"
" float scale, Tensor block_tables,"
" Tensor context_lens, int block_size,"
" int max_context_len,"
" Tensor? alibi_slopes,"
" str kv_cache_dtype,"
" float k_scale, float v_scale) -> ()");
rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
}
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
This diff is collapsed.
...@@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/. ...@@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/.
.. tip:: .. tip::
To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes. Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000`` ``export VLLM_RPC_TIMEOUT=1800000``
Example commands and usage: Example commands and usage:
=========================== ===========================
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment