Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
304e2bab
Commit
304e2bab
authored
Oct 17, 2024
by
zhuwenwen
Browse files
update pa tc interfaces
parent
1c2aa04c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
10 additions
and
10 deletions
+10
-10
csrc/ops.h
csrc/ops.h
+4
-4
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+6
-6
No files found.
csrc/ops.h
View file @
304e2bab
...
@@ -52,8 +52,8 @@ void paged_attention_v1_opt_tc(
...
@@ -52,8 +52,8 @@ void paged_attention_v1_opt_tc(
torch
::
Tensor
&
value_cache
,
int64_t
num_kv_heads
,
double
scale
,
torch
::
Tensor
&
value_cache
,
int64_t
num_kv_heads
,
double
scale
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int64_t
block_size
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int64_t
block_size
,
int64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
int64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
double
k
v
_scale
,
const
int64_t
tp_rank
,
const
std
::
string
&
kv_cache_dtype
,
double
k_scale
,
double
v_scale
,
const
int64_t
blocksparse_local_blocks
,
const
int64_t
tp_rank
,
const
int64_t
blocksparse_local_blocks
,
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_head_sliding_step
);
const
int64_t
blocksparse_head_sliding_step
);
...
@@ -63,8 +63,8 @@ void paged_attention_v2_opt_tc(
...
@@ -63,8 +63,8 @@ void paged_attention_v2_opt_tc(
torch
::
Tensor
&
value_cache
,
int64_t
num_kv_heads
,
double
scale
,
torch
::
Tensor
&
value_cache
,
int64_t
num_kv_heads
,
double
scale
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int64_t
block_size
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int64_t
block_size
,
int64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
int64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
double
k
v
_scale
,
const
int64_t
tp_rank
,
const
std
::
string
&
kv_cache_dtype
,
double
k_scale
,
double
v_scale
,
const
int64_t
blocksparse_local_blocks
,
const
int64_t
tp_rank
,
const
int64_t
blocksparse_local_blocks
,
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_head_sliding_step
);
const
int64_t
blocksparse_head_sliding_step
);
...
...
csrc/torch_bindings.cpp
View file @
304e2bab
...
@@ -76,20 +76,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
...
@@ -76,20 +76,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops
.
impl
(
"paged_attention_v2_opt"
,
torch
::
kCUDA
,
&
paged_attention_v2_opt
);
ops
.
impl
(
"paged_attention_v2_opt"
,
torch
::
kCUDA
,
&
paged_attention_v2_opt
);
// Compute the attention between an input query and the cached
// Compute the attention between an input query and the cached
// keys/values using PagedAttention. (opt
_tc
)
// keys/values using PagedAttention. (opt)
ops
.
def
(
ops
.
def
(
"paged_attention_v1_opt_tc("
"paged_attention_v1_opt_tc("
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float k
v
_scale,
int tp_rank
,"
" str kv_cache_dtype, float k_scale,
float v_scale
,"
" int blocksparse_local_blocks,"
" int
tp_rank, int
blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"
);
" int blocksparse_head_sliding_step) -> ()"
);
ops
.
impl
(
"paged_attention_v1_opt_tc"
,
torch
::
kCUDA
,
&
paged_attention_v1_opt_tc
);
ops
.
impl
(
"paged_attention_v1_opt_tc"
,
torch
::
kCUDA
,
&
paged_attention_v1_opt_tc
);
// PagedAttention V2 (opt
_tc
).
// PagedAttention V2 (opt).
ops
.
def
(
ops
.
def
(
"paged_attention_v2_opt_tc("
"paged_attention_v2_opt_tc("
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
...
@@ -97,8 +97,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
...
@@ -97,8 +97,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float k
v
_scale,
int tp_rank
,"
" str kv_cache_dtype, float k_scale,
float v_scale
,"
" int blocksparse_local_blocks,"
" int
tp_rank, int
blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"
);
" int blocksparse_head_sliding_step) -> ()"
);
ops
.
impl
(
"paged_attention_v2_opt_tc"
,
torch
::
kCUDA
,
&
paged_attention_v2_opt_tc
);
ops
.
impl
(
"paged_attention_v2_opt_tc"
,
torch
::
kCUDA
,
&
paged_attention_v2_opt_tc
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment