// pybind.cpp — pybind11 bindings that expose the vLLM custom ops and cache ops to Python.
#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

// Python extension entry point.
//
// Registers two submodules on the extension module `m`:
//   * `ops`       - attention, activation, layernorm and rotary-embedding ops
//   * `cache_ops` - block swap / copy / reshape-and-cache ops
//
// Each `def` call binds a Python-visible name to a free function that is
// declared outside this file (presumably in "ops.h" / "cache.h" - confirm
// against those headers), together with the docstring shown to Python users.
// TORCH_EXTENSION_NAME is not defined in this file; it is expected to be
// supplied by the extension build.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

  // Attention ops
  ops.def("paged_attention_v1", &paged_attention_v1,
          "Compute the attention between an input query and the cached "
          "keys/values using PagedAttention.");
  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");

  // Activation ops
  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
  ops.def("gelu_and_mul", &gelu_and_mul,
          "Activation function used in GeGLU with `none` approximation.");
  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
          "Activation function used in GeGLU with `tanh` approximation.");
  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");

  // Layernorm
  ops.def("rms_norm", &rms_norm,
          "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
          "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def("rotary_embedding", &rotary_embedding,
          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def("swap_blocks", &swap_blocks,
                "Swap in (out) the cache blocks from src to dst");
  cache_ops.def("copy_blocks", &copy_blocks,
                "Copy the cache blocks from src to dst");
  cache_ops.def("reshape_and_cache", &reshape_and_cache,
                "Reshape the key and value tensors and cache them");
}