jerrrrry / infinicore · Commits · 8d09630a

Unverified commit 8d09630a · authored Feb 11, 2026 by gongchensu, committed by GitHub on Feb 11, 2026

Merge branch 'demo131' into Issue/862

Parents: ab52dead, 012df56c
Changes: 387

Showing 20 changed files with 871 additions and 171 deletions (+871 / -171)
src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc                     +56   -0
src/infinicore/ops/distributed/allreduce.cc                                      +50   -0
src/infinicore/ops/embedding/embedding.cc                                        +16  -68
src/infinicore/ops/embedding/embedding_infiniop.cc                               +44   -0
src/infinicore/ops/flash_attention/flash_attention.cc                            +31   -0
src/infinicore/ops/flash_attention/flash_attention_infiniop.cc                   +55   -0
src/infinicore/ops/gemm/gemm.cc                                                   +9  -10
src/infinicore/ops/gemm/gemm_infiniop.cc                                         +39  -40
src/infinicore/ops/infiniop_impl.hpp                                             +73   -0
src/infinicore/ops/kv_caching/kv_caching.cc                                      +42   -0
src/infinicore/ops/kv_caching/kv_caching_infiniop.cc                             +60   -0
src/infinicore/ops/linear/linear.cc                                              +11   -9
src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc                          +60   -0
src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc                                +66   -0
src/infinicore/ops/mul/mul.cc                                                     +9  -10
src/infinicore/ops/mul/mul_infiniop.cc                                           +35  -34
src/infinicore/ops/paged_attention/paged_attention.cc                            +38   -0
src/infinicore/ops/paged_attention/paged_attention_infiniop.cc                   +68   -0
src/infinicore/ops/paged_attention_prefill/paged_attention_prefill.cc            +39   -0
src/infinicore/ops/paged_attention_prefill/paged_attention_prefill_infiniop.cc   +70   -0
src/infinicore/ops/dequantize_awq/dequantize_awq_infiniop.cc (new file, 0 → 100644)

#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/dequantize_awq.hpp"

#include <infiniop.h>

namespace infinicore::op::dequantize_awq_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DequantizeAWQ, 100);

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, x, x_packed, x_scale, x_zeros;
};

void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
    size_t seed = hash_combine(x, x_packed, x_scale, x_zeros);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, DequantizeAWQ, seed,
                                               x->desc(), x_packed->desc(), x_scale->desc(), x_zeros->desc());
    INFINIOP_WORKSPACE_TENSOR(workspace, DequantizeAWQ, descriptor);
    return new PlannedMeta{descriptor,
                           graph::GraphTensor(workspace),
                           graph::GraphTensor(x),
                           graph::GraphTensor(x_packed),
                           graph::GraphTensor(x_scale),
                           graph::GraphTensor(x_zeros)};
}

void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopDequantizeAWQ(
        planned->descriptor->desc,
        planned->workspace->data(), planned->workspace->numel(),
        planned->x->data(),
        planned->x_packed->data(),
        planned->x_scale->data(),
        planned->x_zeros->data(),
        context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(DequantizeAWQ, &plan, &run, &cleanup);

} // namespace infinicore::op::dequantize_awq_impl::infiniop
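Reader's note: plan/run/cleanup is the graph-op contract that every infiniop-backed file in this commit registers. plan builds or fetches a cached descriptor, sizes a workspace tensor, and captures the operands as graph tensors; run replays the recorded call on the current stream; cleanup frees the planned state. A minimal sketch of a driver for these hooks (the GraphOp struct below is an illustrative assumption, not the project's actual runtime):

// Hypothetical driver showing the plan/run/cleanup lifecycle.
// Only the three function pointers mirror the registered hooks;
// everything else here is illustrative.
struct GraphOp {
    void *meta = nullptr;                  // opaque result of plan(...)
    void (*run_fn)(void *) = nullptr;      // replays the captured call
    void (*cleanup_fn)(void **) = nullptr; // frees and nulls the meta

    void replay() const { run_fn(meta); }
    ~GraphOp() {
        if (cleanup_fn && meta) {
            cleanup_fn(&meta);
        }
    }
};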
src/infinicore/ops/distributed/allreduce.cc (new file, 0 → 100644)

#include "infinicore/ops/distributed/allreduce.hpp"

#include "../../utils.hpp"

namespace infinicore::op::distributed {

struct PlannedMeta {
    graph::GraphTensor output, input;
    infinicclReduceOp_t op;
    infinicclComm_t communicator;
};

AllReduce::AllReduce(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
    INFINICORE_ASSERT(output->is_contiguous() && input->is_contiguous());
    INFINICORE_ASSERT(output->numel() == input->numel());
    planned_meta_ = new PlannedMeta{graph::GraphTensor(output), graph::GraphTensor(input), op, communicator};
}

AllReduce::~AllReduce() {
    if (planned_meta_) {
        PlannedMeta *meta = reinterpret_cast<PlannedMeta *>(planned_meta_);
        delete meta;
    }
}

void AllReduce::run() const {
    PlannedMeta *meta = reinterpret_cast<PlannedMeta *>(planned_meta_);
    INFINICORE_CHECK_ERROR(infinicclAllReduce(
        meta->input->data(), meta->output->data(), meta->input->numel(),
        static_cast<infiniDtype_t>(static_cast<int>(meta->input->dtype())),
        meta->op, meta->communicator,
        infinicore::context::getStream()));
}

void AllReduce::execute(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(AllReduce, output, input, op, communicator);
}

Tensor allreduce(const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
    auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
    allreduce_(output, input, op, communicator);
    return output;
}

void allreduce_(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
    AllReduce::execute(output, input, op, communicator);
}

} // namespace infinicore::op::distributed
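Usage sketch for the allocating entry point above (assumes an already-initialized infinicclComm_t for this rank; INFINICCL_SUM is an assumed enumerator name for the reduction op, not confirmed by this diff):

// Minimal sketch: sum a tensor across all ranks in a communicator.
using namespace infinicore;

Tensor sum_across_ranks(const Tensor &grad, infinicclComm_t comm) {
    // Contiguity and matching numel are enforced by AllReduce's asserts.
    return op::distributed::allreduce(grad, INFINICCL_SUM, comm);
}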
src/infinicore/ops/embedding/embedding.cc

 #include "infinicore/ops/embedding.hpp"
 #include "infinicore/context/context.hpp"

 #include <cstring>

 #include "../../utils.hpp"

 namespace infinicore::op {

+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
+
+Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
+    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
+}
+
+void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
+}
+
-Tensor embedding(Tensor input,  // LongTensor of arbitrary shape containing the indices to extract
-                 Tensor weight  // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
+Tensor embedding(const Tensor &input,  // LongTensor of arbitrary shape containing the indices to extract
+                 const Tensor &weight  // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
 ) {
     auto input_shape = input->shape();
     auto weight_shape = weight->shape();
     // auto vocab_size = weight_shape[0];
     auto embedding_dim = weight_shape[1];
     // Assign memory to out variables
...
@@ -21,69 +30,8 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
     return inputs_embeds;
 }

-void embedding_(Tensor out, Tensor input, Tensor weight) {
-    assert(infinicore::DataType::I64 == input->dtype() || (infinicore::DataType::I32 == input->dtype()));
-    assert(infinicore::Device::Type::CPU == input->device().getType());
-    auto input_shape = input->shape();
-    auto weight_shape = weight->shape();
-    auto embedding_dim = weight_shape[1];
-    // Calculate the number of tokens
-    Size counts = 1;
-    for (auto &v : input_shape) {
-        counts *= v;
-    }
-    // the bytes of one token
-    const Size bytes = dsize(weight->dtype()) * embedding_dim;
-    auto *weight_ptr = weight->data();
-    auto *out_ptr = out->data();
-    // copies
-    if (weight->device().getType() == Device::Type::CPU) {
-        if (infinicore::DataType::I64 == input->dtype()) {
-            const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data());
-            for (Size i = 0; i < counts; ++i) {
-                int64_t idx = input_arr[i];
-                assert((idx >= 0) && (idx < weight_shape[0]));
-                std::memcpy(out_ptr + i * bytes, weight_ptr + idx * bytes, bytes);
-            }
-        } else if (infinicore::DataType::I32 == input->dtype()) {
-            const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data());
-            for (Size i = 0; i < counts; ++i) {
-                int32_t idx = input_arr[i];
-                assert((idx >= 0) && (idx < weight_shape[0]));
-                std::memcpy(out_ptr + i * bytes, weight_ptr + idx * bytes, bytes);
-            }
-        }
-    } else {
-        if (infinicore::DataType::I64 == input->dtype()) {
-            const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data());
-            for (Size i = 0; i < counts; ++i) {
-                int64_t idx = input_arr[i];
-                assert((idx >= 0) && (idx < weight_shape[0]));
-                context::memcpyD2D(out_ptr + i * bytes, weight_ptr + idx * bytes, bytes);
-            }
-        } else if (infinicore::DataType::I32 == input->dtype()) {
-            const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data());
-            for (Size i = 0; i < counts; ++i) {
-                int32_t idx = input_arr[i];
-                assert((idx >= 0) && (idx < weight_shape[0]));
-                context::memcpyD2D(out_ptr + i * bytes, weight_ptr + idx * bytes, bytes);
-            }
-        }
-    }
-}
+void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
+    Embedding::execute(out, input, weight);
+}

 } // namespace infinicore::op
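A short usage sketch of the allocating entry point, matching the signature comments above (shape names are illustrative):

// Gather rows of an embedding matrix by token id.
using namespace infinicore;

Tensor embed_tokens(const Tensor &token_ids, // I64 or I32 indices, shape (B, S)
                    const Tensor &weight) {  // float matrix, shape (V, embedding_dim)
    // Result has shape (B, S, embedding_dim); out-of-range ids trip the asserts.
    return op::embedding(token_ids, weight);
}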
src/infinicore/ops/embedding/embedding_infiniop.cc (new file, 0 → 100644)

#include "../infiniop_impl.hpp"
#include "infinicore/ops/embedding.hpp"

namespace infinicore::op::embedding_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor out, input, weight;
};

void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
    size_t seed = hash_combine(out, input, weight);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, Embedding, seed,
                                               out->desc(), input->desc(), weight->desc());
    auto planned = new PlannedMeta{descriptor,
                                   graph::GraphTensor(out),
                                   graph::GraphTensor(input),
                                   graph::GraphTensor(weight)};
    return planned;
}

void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopEmbedding(
        planned->descriptor->desc,
        planned->out->data(),
        planned->input->data(),
        planned->weight->data(),
        context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, cleanup);

} // namespace infinicore::op::embedding_impl::infiniop
src/infinicore/ops/flash_attention/flash_attention.cc (new file, 0 → 100644)

#include "infinicore/ops/flash_attention.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(FlashAttention);

FlashAttention::FlashAttention(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v,
                               const Tensor &total_kv_len, float scale, bool is_causal) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k, v);
    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, q, k, v, total_kv_len, scale, is_causal);
}

void FlashAttention::execute(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v,
                             const Tensor &total_kv_len, float scale, bool is_causal) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(FlashAttention, out, q, k, v, total_kv_len, scale, is_causal);
}

Tensor flash_attention(const Tensor &q, const Tensor &k, const Tensor &v,
                       const Tensor &total_kv_len, float scale, bool is_causal) {
    Shape shape = q->shape();
    int idx = shape.size() - 1;
    shape[idx] = v->shape()[idx];
    auto out = Tensor::empty(shape, q->dtype(), q->device());
    flash_attention_(out, q, k, v, total_kv_len, scale, is_causal);
    return out;
}

void flash_attention_(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v,
                      const Tensor &total_kv_len, float scale, bool is_causal) {
    FlashAttention::execute(out, q, k, v, total_kv_len, scale, is_causal);
}

} // namespace infinicore::op
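Note the output-shape rule in flash_attention above: the result copies q's shape but takes its last dimension from v. A usage sketch (the 1/sqrt(head_dim) scale is the conventional choice, an assumption rather than something this file enforces):

// Causal flash attention over q/k/v, with per-batch KV lengths.
#include <cmath>
using namespace infinicore;

Tensor attend(const Tensor &q, const Tensor &k, const Tensor &v,
              const Tensor &total_kv_len, size_t head_dim) {
    float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
    return op::flash_attention(q, k, v, total_kv_len, scale, /*is_causal=*/true);
}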
src/infinicore/ops/flash_attention/flash_attention_infiniop.cc (new file, 0 → 100644)

#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/flash_attention.hpp"

#include <infiniop.h>

namespace infinicore::op::flash_attention_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, FlashAttention, 100);

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, out, q, k, v, total_kv_len;
    float scale;
    bool is_causal;
};

void *plan(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v,
           const Tensor &total_kv_len, float scale, bool is_causal) {
    size_t seed = hash_combine(out, q, k, v, total_kv_len, scale, is_causal);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, FlashAttention, seed,
                                               out->desc(), q->desc(), k->desc(), v->desc(),
                                               total_kv_len->desc(), scale, is_causal);
    INFINIOP_WORKSPACE_TENSOR(workspace, FlashAttention, descriptor);
    auto planned = new PlannedMeta{descriptor,
                                   graph::GraphTensor(workspace),
                                   graph::GraphTensor(out),
                                   graph::GraphTensor(q),
                                   graph::GraphTensor(k),
                                   graph::GraphTensor(v),
                                   graph::GraphTensor(total_kv_len),
                                   scale,
                                   is_causal};
    return planned;
}

void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopFlashAttention(
        planned->descriptor->desc,
        planned->workspace->data(), planned->workspace->numel(),
        planned->out->data(),
        planned->q->data(),
        planned->k->data(),
        planned->v->data(),
        planned->total_kv_len->data(),
        context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(FlashAttention, &plan, &run, &cleanup);

} // namespace infinicore::op::flash_attention_impl::infiniop
src/infinicore/ops/gemm/gemm.cc

...
@@ -3,19 +3,18 @@
 #include "../../utils.hpp"

 namespace infinicore::op {

+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm);
-common::OpDispatcher<Gemm::schema> &Gemm::dispatcher() {
-    static common::OpDispatcher<Gemm::schema> dispatcher_;
-    return dispatcher_;
-};

-void Gemm::execute(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
+Gemm::Gemm(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
-    infinicore::context::setDevice(c->device());
-    dispatcher().lookup(c->device().getType())(c, a, b, alpha, beta);
+    INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta);
 }

+void Gemm::execute(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta);
+}
+
-Tensor gemm(Tensor a, Tensor b, float alpha, float beta) {
+Tensor gemm(const Tensor &a, const Tensor &b, float alpha, float beta) {
     Shape shape = a->shape();
     Size size = a->ndim();
     shape[size - 1] = b->size(size - 1);
...
@@ -24,7 +23,7 @@ Tensor gemm(Tensor a, Tensor b, float alpha, float beta) {
     return c;
 }

-void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
+void gemm_(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
     Gemm::execute(c, a, b, alpha, beta);
 }
...
src/infinicore/ops/gemm/gemm_infiniop.cc

 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
+#include "../infiniop_impl.hpp"
 #include "infinicore/ops/gemm.hpp"

 #include <infiniop.h>

 namespace infinicore::op::gemm_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopGemmDescriptor_t> caches(
-    100, // capacity
-    [](infiniopGemmDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyGemmDescriptor(desc));
-            desc = nullptr;
-        }
-    });
-
-void calculate(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
-    size_t seed = hash_combine(c, b, a, alpha, beta);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopGemmDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            c->desc(), a->desc(), b->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetGemmWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100);
+
+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, c, a, b;
+    float alpha, beta;
+};
+
+void *plan(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
+    size_t seed = hash_combine(c, a, b);
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, Gemm, seed,
+                                               c->desc(), a->desc(), b->desc());
+    INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor);
+    auto planned = new PlannedMeta{descriptor,
+                                   graph::GraphTensor(workspace),
+                                   graph::GraphTensor(c),
+                                   graph::GraphTensor(a),
+                                   graph::GraphTensor(b),
+                                   alpha, beta};
+    return planned;
+}

+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
-    INFINICORE_CHECK_ERROR(infiniopGemm(
-        desc, workspace->data(), workspace_size,
-        c->data(), a->data(), b->data(),
-        alpha, beta, context::getStream()));
-}
+    INFINICORE_CHECK_ERROR(infiniopGemm(
+        planned->descriptor->desc,
+        planned->workspace->data(), planned->workspace->numel(),
+        planned->c->data(), planned->a->data(), planned->b->data(),
+        planned->alpha, planned->beta, context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    Gemm::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup);

 } // namespace infinicore::op::gemm_impl::infiniop
src/infinicore/ops/infiniop_impl.hpp (new file, 0 → 100644)

#pragma once

#include "../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"

#include <infiniop.h>

#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__)    \
    struct __DESC_TYPE__ {                                                    \
        infiniop##__OP_NAME__##Descriptor_t desc = nullptr;                   \
                                                                              \
        explicit __DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t d)         \
            : desc(d) {}                                                      \
                                                                              \
        /* non-copyable */                                                    \
        __DESC_TYPE__(const __DESC_TYPE__ &) = delete;                        \
        __DESC_TYPE__ &operator=(const __DESC_TYPE__ &) = delete;             \
                                                                              \
        /* movable */                                                         \
        __DESC_TYPE__(__DESC_TYPE__ &&other) noexcept                         \
            : desc(other.desc) {                                              \
            other.desc = nullptr;                                             \
        }                                                                     \
                                                                              \
        __DESC_TYPE__ &operator=(__DESC_TYPE__ &&other) noexcept {            \
            if (this != &other) {                                             \
                if (desc != nullptr) {                                        \
                    infiniopDestroy##__OP_NAME__##Descriptor(desc);           \
                }                                                             \
                desc = other.desc;                                            \
                other.desc = nullptr;                                         \
            }                                                                 \
            return *this;                                                     \
        }                                                                     \
                                                                              \
        ~__DESC_TYPE__() {                                                    \
            if (desc != nullptr) {                                            \
                infiniopDestroy##__OP_NAME__##Descriptor(desc);               \
            }                                                                 \
        }                                                                     \
    };                                                                        \
                                                                              \
    thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>>      \
        caches(                                                               \
            __SIZE__,                                                         \
            [](std::shared_ptr<__DESC_TYPE__> &desc) {                        \
                desc = nullptr;                                               \
            });

#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
    std::shared_ptr<__DESC_TYPE__> __DESC_NAME__;                                                                      \
    {                                                                                                                  \
        auto device__ = context::getDevice();                                                                          \
        auto &cache__ = caches.getCache(device__);                                                                     \
        __DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr);                                                   \
        if (!__DESC_NAME__) {                                                                                          \
            __DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr);                                                  \
            INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor(                                      \
                context::getInfiniopHandle(device__),                                                                  \
                &__DESC_NAME__->desc,                                                                                  \
                __VA_ARGS__));                                                                                         \
            cache__.put(__HASH_KEY__, __DESC_NAME__);                                                                  \
        }                                                                                                              \
    }

#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__)                                 \
    Tensor __TENSOR_NAME__;                                                                                          \
    {                                                                                                                \
        auto device__ = context::getDevice();                                                                        \
        size_t workspace_size = 0;                                                                                   \
        INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \
        __TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__);                                   \
    }
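Taken together, these three macros carry the boilerplate for every infiniop-backed op in this commit: an RAII descriptor type plus a thread-local per-device cache, a get-or-create lookup keyed by a tensor hash, and a workspace tensor sized by the op's GetWorkspaceSize query. A condensed sketch of how an op file composes them, using a hypothetical Relu op (the infiniopRelu* names are illustrative assumptions and would compile only where such entry points exist; the real uses are the *_infiniop.cc files in this diff):

// Hypothetical single-input op composing the three macros.
namespace infinicore::op::relu_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Relu, 100); // RAII wrapper + thread-local cache

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, y, x;
};

void *plan(Tensor y, const Tensor &x) {
    size_t seed = hash_combine(y, x); // cache key derived from the tensor descriptors
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, Relu, seed,
                                               y->desc(), x->desc());
    INFINIOP_WORKSPACE_TENSOR(workspace, Relu, descriptor); // sized via infiniopGetReluWorkspaceSize
    return new PlannedMeta{descriptor, graph::GraphTensor(workspace),
                           graph::GraphTensor(y), graph::GraphTensor(x)};
}

} // namespace infinicore::op::relu_impl::infiniop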
src/infinicore/ops/kv_caching/kv_caching.cc (new file, 0 → 100644)

#include "infinicore/ops/kv_caching.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(KVCaching);

KVCaching::KVCaching(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v,
                     const Tensor &past_kv_lengths) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, past_kv_lengths);
    INFINICORE_GRAPH_OP_DISPATCH(k_cache->device().getType(), k_cache, v_cache, k, v, past_kv_lengths);
}

void KVCaching::execute(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v,
                        const Tensor &past_kv_lengths) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(KVCaching, k_cache, v_cache, k, v, past_kv_lengths);
}

void kv_caching_(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v,
                 const Tensor &past_kv_lengths) {
    KVCaching::execute(k_cache, v_cache, k, v, past_kv_lengths);
}

} // namespace infinicore::op
src/infinicore/ops/kv_caching/kv_caching_infiniop.cc (new file, 0 → 100644)

#include "../infiniop_impl.hpp"
#include "infinicore/ops/kv_caching.hpp"

namespace infinicore::op::kv_caching_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, KVCaching, 100);

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, k_cache, v_cache, k, v, past_kv_lengths;
};

void *plan(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &past_kv_lengths) {
    size_t seed = hash_combine(k_cache, v_cache, k, v, past_kv_lengths);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, KVCaching, seed,
                                               k_cache->desc(), v_cache->desc(),
                                               k->desc(), v->desc(), past_kv_lengths->desc());
    INFINIOP_WORKSPACE_TENSOR(workspace, KVCaching, descriptor);
    auto planned = new PlannedMeta{descriptor,
                                   graph::GraphTensor(workspace),
                                   graph::GraphTensor(k_cache),
                                   graph::GraphTensor(v_cache),
                                   graph::GraphTensor(k),
                                   graph::GraphTensor(v),
                                   graph::GraphTensor(past_kv_lengths)};
    return planned;
}

void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopKVCaching(
        planned->descriptor->desc,
        nullptr, 0,
        planned->k_cache->data(),
        planned->v_cache->data(),
        planned->k->data(),
        planned->v->data(),
        planned->past_kv_lengths->data(),
        context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(KVCaching, &plan, &run, cleanup);

} // namespace infinicore::op::kv_caching_impl::infiniop
src/infinicore/ops/linear/linear.cc

 #include "infinicore/ops/linear.hpp"
-#include "infinicore/ops/add.hpp"
-#include "infinicore/ops/matmul.hpp"
+#include "infinicore/ops/gemm.hpp"
+#include "infinicore/ops/rearrange.hpp"

 namespace infinicore::op {

...
@@ -42,16 +42,18 @@ void linear_(Tensor out,

     // linear transformation
     Tensor out_view = out->view({N, out_features});
-    matmul_(out_view, input->view({N, in_features}), weight->permute({1, 0}));
-    // Add bias
+    float alpha = 1.0f;
+    float beta = 0.0f;
     if (bias.has_value()) {
-        add_(out_view, out_view, bias.value()->as_strided({N, out_features}, {0, 1}));
+        rearrange_(out_view, bias.value()->as_strided({N, out_features}, {0, 1}));
+        beta = 1.0f;
     }
+    gemm_(out_view, input->view({N, in_features}), weight->permute({1, 0}), alpha, beta);
 }

 } // namespace infinicore::op
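The refactor above folds the bias add into the GEMM epilogue: out_view is first overwritten with the broadcast bias via rearrange_, then gemm_ computes out = alpha * (input @ weight^T) + beta * out with beta = 1, so the bias accumulates inside the same kernel instead of a separate add_ pass. A scalar sketch of the equivalence (values illustrative):

// old path: y = x*w, then y += b        (matmul_ then add_)
// new path: y = b,   then y = x*w + 1*y (rearrange_ then gemm_ with beta = 1)
#include <cassert>

int main() {
    float x = 3.0f, w = 2.0f, b = 0.5f;
    float old_y = x * w + b;               // separate add
    float new_y = b;                       // output pre-filled with the bias
    new_y = 1.0f * (x * w) + 1.0f * new_y; // gemm epilogue, alpha = beta = 1
    assert(old_y == new_y);
    return 0;
}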
src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc (new file, 0 → 100644)

#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include "infinicore/ops/gemm.hpp"

namespace infinicore::op {

Tensor linear_w4a16_awq(Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros,
                        std::optional<Tensor> bias) {
    // Input is of shape [M, K]; weight_packed is of shape [N, K] with strides [N, 1]
    Size ndim = input->ndim();
    Size out_features = weight_packed->shape()[0];
    // Allocate memory for the output
    auto output_shape = input->shape();
    output_shape[ndim - 1] = out_features;
    auto out = Tensor::empty(output_shape, input->dtype(), input->device());
    // Calculate in place
    linear_w4a16_awq_(out, input, weight_packed, weight_scale, weight_zeros, bias);
    return out;
}

void linear_w4a16_awq_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale, Tensor weight_zeros,
                       std::optional<Tensor> bias) {
    auto weight_packed_shape = weight_packed->shape();
    Size out_features = weight_packed_shape[0];
    Size in_features = weight_packed_shape[1];
    Size ndim = input->ndim();
    assert(out->ndim() == ndim);
    Size N = 1;
    auto input_shape = input->shape();
    for (size_t i = 0; i < ndim - 1; ++i) {
        N *= input_shape[i];
    }
    auto weight = Tensor::empty({out_features, in_features}, out->dtype(), weight_packed->device());
    float alpha = 1.0f;
    float beta = 0.0f;
    op::dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros);
    bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
    gemm_(out->view({N, out_features}), input->view({N, in_features}), weight->permute({1, 0}), alpha, beta);
}

} // namespace infinicore::op
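The w4a16 path is dequantize-then-GEMM: the packed int4 weights are expanded once into a full-precision [out_features, in_features] matrix, then fed through the same gemm_ epilogue as the float linear above. A scalar sketch of the AWQ-style expansion assumed behind dequantize_awq_, w = (q - z) * s with a per-group zero point z and scale s (the packing layout and values here are illustrative assumptions):

#include <cassert>
#include <cstdint>

int main() {
    uint8_t packed = 0xB4;              // two 4-bit codes in one byte: low = 4, high = 11
    uint8_t q_lo = packed & 0x0F;
    uint8_t q_hi = packed >> 4;
    float scale = 0.1f, zero = 8.0f;    // per-group quantization parameters
    float w_lo = (q_lo - zero) * scale; // (4 - 8) * 0.1 = -0.4
    float w_hi = (q_hi - zero) * scale; // (11 - 8) * 0.1 = 0.3
    assert(w_lo < 0.0f && w_hi > 0.0f);
    return 0;
}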
src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc (new file, 0 → 100644)

#include "infinicore/ops/linear_w8a8i8.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"

namespace infinicore::op {

Tensor linear_w8a8i8(Tensor input, Tensor weight_packed, Tensor weight_scale, std::optional<Tensor> bias) {
    // Input is of shape [M, K]; weight_packed is of shape [N, K] with strides [N, 1]
    Size ndim = input->ndim();
    Size out_features = weight_packed->shape()[0];
    // Allocate memory for the output
    auto output_shape = input->shape();
    output_shape[ndim - 1] = out_features;
    auto out = Tensor::empty(output_shape, input->dtype(), input->device());
    // Calculate in place
    linear_w8a8i8_(out, input, weight_packed, weight_scale, bias);
    return out;
}

void linear_w8a8i8_(Tensor out, Tensor input, Tensor weight_packed, Tensor weight_scale,
                    std::optional<Tensor> bias) {
    auto weight_packed_shape = weight_packed->shape();
    Size out_features = weight_packed_shape[0];
    Size in_features = weight_packed_shape[1];
    Size ndim = input->ndim();
    assert(out->ndim() == ndim);
    Size N = 1;
    auto input_shape = input->shape();
    for (size_t i = 0; i < ndim - 1; ++i) {
        N *= input_shape[i];
    }
    auto input_packed = Tensor::empty({N, input_shape[ndim - 1]}, DataType::I8, input->device());
    auto input_scale = Tensor::empty({N, 1}, DataType::F32, input->device());
    op::per_channel_quant_i8_(input->view({N, in_features}), input_packed, input_scale);
    if (bias.has_value()) {
        bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
    }
    op::scaled_mm_i8_(out->view({N, out_features}), input_packed, input_scale,
                      weight_packed->permute({1, 0}), weight_scale, bias);
}

} // namespace infinicore::op
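The w8a8 path quantizes the activation on the fly (per_channel_quant_i8_ produces an int8 tensor plus one float scale per row), then scaled_mm_i8_ runs the matmul in integer arithmetic and rescales. The identity that makes this work: a product of dequantized values equals the integer product times the product of scales, so one rescale at the end suffices. A scalar check:

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
    int8_t xq = 2, wq = -3;      // quantized activation / weight (illustrative)
    float sx = 0.25f, sw = 0.5f; // per-channel scales
    float dequant_first = (xq * sx) * (wq * sw);                    // dequantize, then multiply
    float rescale_last = static_cast<int32_t>(xq) * wq * (sx * sw); // int product, one rescale
    assert(std::fabs(dequant_first - rescale_last) < 1e-6f);
    return 0;
}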
src/infinicore/ops/mul/mul.cc

 #include "infinicore/ops/mul.hpp"

 #include "../../utils.hpp"

 namespace infinicore::op {

-common::OpDispatcher<Mul::schema> &Mul::dispatcher() {
-    static common::OpDispatcher<Mul::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Mul);

-void Mul::execute(Tensor c, Tensor a, Tensor b) {
+Mul::Mul(Tensor c, const Tensor &a, const Tensor &b) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
-    infinicore::context::setDevice(c->device());
-    dispatcher().lookup(c->device().getType())(c, a, b);
+    INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
 }

+void Mul::execute(Tensor c, const Tensor &a, const Tensor &b) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Mul, c, a, b);
+}
+
-Tensor mul(Tensor a, Tensor b) {
+Tensor mul(const Tensor &a, const Tensor &b) {
     auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
     mul_(c, a, b);
     return c;
 }

-void mul_(Tensor c, Tensor a, Tensor b) {
+void mul_(Tensor c, const Tensor &a, const Tensor &b) {
     Mul::execute(c, a, b);
 }
...
src/infinicore/ops/mul/mul_infiniop.cc

 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/mul.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"
+
 namespace infinicore::op::mul_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopMulDescriptor_t> caches(
-    100, // capacity
-    [](infiniopMulDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyMulDescriptor(desc));
-            desc = nullptr;
-        }
-    });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Mul, 100);
+
+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, c, a, b;
+};

-void calculate(Tensor c, Tensor a, Tensor b) {
+void *plan(Tensor c, const Tensor &a, const Tensor &b) {
     size_t seed = hash_combine(c, b, a);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopMulDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateMulDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            c->desc(), a->desc(), b->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, Mul, seed,
+                                               c->desc(), a->desc(), b->desc());
+    INFINIOP_WORKSPACE_TENSOR(workspace, Mul, descriptor);
+    return new PlannedMeta{descriptor,
+                           graph::GraphTensor(workspace),
+                           graph::GraphTensor(c),
+                           graph::GraphTensor(a),
+                           graph::GraphTensor(b)};
+}

-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetMulWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
-    INFINICORE_CHECK_ERROR(infiniopMul(
-        desc, workspace->data(), workspace_size,
-        c->data(), a->data(), b->data(),
-        context::getStream()));
-}
+    INFINICORE_CHECK_ERROR(infiniopMul(
+        planned->descriptor->desc,
+        planned->workspace->data(), planned->workspace->numel(),
+        planned->c->data(), planned->a->data(), planned->b->data(),
+        context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    Mul::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Mul, &plan, &run, &cleanup);

 } // namespace infinicore::op::mul_impl::infiniop
src/infinicore/ops/paged_attention/paged_attention.cc (new file, 0 → 100644)

#include "infinicore/ops/paged_attention.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedAttention);

PagedAttention::PagedAttention(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
                               const Tensor &block_tables, const Tensor &kv_lens,
                               std::optional<Tensor> alibi_slopes, float scale) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens);
    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, q, k_cache, v_cache,
                                 block_tables, kv_lens, alibi_slopes, scale);
}

void PagedAttention::execute(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
                             const Tensor &block_tables, const Tensor &kv_lens,
                             std::optional<Tensor> alibi_slopes, float scale) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(PagedAttention, out, q, k_cache, v_cache,
                                      block_tables, kv_lens, alibi_slopes, scale);
}

Tensor paged_attention(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
                       const Tensor &block_tables, const Tensor &kv_lens,
                       std::optional<Tensor> alibi_slopes, float scale) {
    auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
    paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
    return out;
}

void paged_attention_(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
                      const Tensor &block_tables, const Tensor &kv_lens,
                      std::optional<Tensor> alibi_slopes, float scale) {
    PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}

} // namespace infinicore::op
src/infinicore/ops/paged_attention/paged_attention_infiniop.cc (new file, 0 → 100644)

#include "infinicore/ops/paged_attention.hpp"

#include "../infiniop_impl.hpp"

namespace infinicore::op::paged_attention_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedAttention, 100);

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, out, q, k_cache, v_cache, block_tables, cache_lens;
    std::optional<graph::GraphTensor> alibi_slopes;
    float scale;
};

void *plan(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
           const Tensor &block_tables, const Tensor &cache_lens,
           std::optional<Tensor> alibi_slopes, float scale) {
    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, PagedAttention, seed,
                                               out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
                                               block_tables->desc(), cache_lens->desc(),
                                               alibi_slopes ? alibi_slopes.value()->desc() : nullptr,
                                               scale);
    INFINIOP_WORKSPACE_TENSOR(workspace, PagedAttention, descriptor);
    return new PlannedMeta{descriptor,
                           graph::GraphTensor(workspace),
                           graph::GraphTensor(out),
                           graph::GraphTensor(q),
                           graph::GraphTensor(k_cache),
                           graph::GraphTensor(v_cache),
                           graph::GraphTensor(block_tables),
                           graph::GraphTensor(cache_lens),
                           alibi_slopes ? std::optional<graph::GraphTensor>(graph::GraphTensor(*alibi_slopes))
                                        : std::nullopt,
                           scale};
}

void run(void *planned_meta) {
    auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopPagedAttention(
        p->descriptor->desc,
        p->workspace->data(), p->workspace->numel(),
        p->out->data(),
        p->q->data(),
        p->k_cache->data(),
        p->v_cache->data(),
        p->block_tables->data(),
        p->cache_lens->data(),
        p->alibi_slopes.has_value() ? p->alibi_slopes.value()->data() : nullptr,
        context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedAttention, &plan, &run, &cleanup);

} // namespace infinicore::op::paged_attention_impl::infiniop
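Paged attention reads K/V through an indirection: block_tables maps each sequence's logical block index to a physical block in the cache, and cache_lens bounds the valid positions. A sketch of the address computation (the layout and shapes here are assumptions for illustration; the actual layout is the operator's concern, hidden behind infiniopPagedAttention):

#include <cstdint>

// Element offset of a cached token, given one sequence's block-table row.
int64_t kv_row_offset(const int32_t *block_table, // physical block ids for this sequence
                      int64_t pos,                // logical token position in the sequence
                      int64_t block_size,         // tokens per cache block
                      int64_t row_stride) {       // elements stored per token
    int64_t block = block_table[pos / block_size]; // which physical block
    int64_t slot = pos % block_size;               // offset within that block
    return (block * block_size + slot) * row_stride;
}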
src/infinicore/ops/paged_attention_prefill/paged_attention_prefill.cc (new file, 0 → 100644)

#include "infinicore/ops/paged_attention_prefill.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

common::OpDispatcher<PagedAttentionPrefill::schema> &PagedAttentionPrefill::dispatcher() {
    static common::OpDispatcher<PagedAttentionPrefill::schema> dispatcher_;
    return dispatcher_;
};

void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
                                    Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
                                    std::optional<Tensor> alibi_slopes, float scale) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q);
    infinicore::context::setDevice(out->device());
    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, kv_lens,
                                                 cum_seqlens_q, alibi_slopes, scale);
}

Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache,
                               Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
                               std::optional<Tensor> alibi_slopes, float scale) {
    auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
    paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q,
                             alibi_slopes, scale);
    return out;
}

void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
                              Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
                              std::optional<Tensor> alibi_slopes, float scale) {
    PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q,
                                   alibi_slopes, scale);
}

} // namespace infinicore::op
src/infinicore/ops/paged_attention_prefill/paged_attention_prefill_infiniop.cc (new file, 0 → 100644)

#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/paged_attention_prefill.hpp"

#include <infiniop.h>

namespace infinicore::op::paged_attention_prefill_impl::infiniop {

thread_local common::OpCache<size_t, infiniopPagedAttentionPrefillDescriptor_t> caches(
    100, // capacity
    [](infiniopPagedAttentionPrefillDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyPagedAttentionPrefillDescriptor(desc));
            desc = nullptr;
        }
    });

void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
               Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
               std::optional<Tensor> alibi_slopes, float scale) {
    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q,
                               alibi_slopes, scale);
    auto device = context::getDevice();
    auto &cache = caches.getCache(device);
    auto desc_opt = cache.get(seed);
    infiniopPagedAttentionPrefillDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionPrefillDescriptor(
            context::getInfiniopHandle(device), &desc,
            out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
            block_tables->desc(), kv_lens->desc(), cum_seqlens_q->desc(),
            alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
            scale));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetPagedAttentionPrefillWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopPagedAttentionPrefill(
        desc, workspace->data(), workspace_size,
        out->data(), q->data(), k_cache->data(), v_cache->data(),
        block_tables->data(), kv_lens->data(), cum_seqlens_q->data(),
        alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
        context::getStream()));
}

static bool registered = []() {
    PagedAttentionPrefill::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::paged_attention_prefill_impl::infiniop