jerrrrry / infinicore · Commit 784139b9

Unverified commit 784139b9, authored Feb 13, 2026 by thatPepe; committed by GitHub on Feb 13, 2026.

Merge pull request #990 from InfiniTensor/demo131

Demo-131: CUDA graph with optimized paged attention

Parents: 3c8fb3c0, 1d6527cb
Showing 20 changed files, with 710 additions and 326 deletions (+710 −326):

src/infinicore/ops/gemm/gemm_infiniop.cc (+1 −1)
src/infinicore/ops/infiniop_impl.hpp (+40 −17)
src/infinicore/ops/kv_caching/kv_caching.cc (+42 −0)
src/infinicore/ops/kv_caching/kv_caching_infiniop.cc (+60 −0)
src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc (+60 −0)
src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc (+66 −0)
src/infinicore/ops/mul/mul.cc (+9 −10)
src/infinicore/ops/mul/mul_infiniop.cc (+35 −34)
src/infinicore/ops/paged_attention/paged_attention.cc (+20 −10)
src/infinicore/ops/paged_attention/paged_attention_infiniop.cc (+60 −46)
src/infinicore/ops/paged_caching/paged_caching.cc (+8 −9)
src/infinicore/ops/paged_caching/paged_caching_infiniop.cc (+49 −42)
src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc (+20 −0)
src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc (+56 −0)
src/infinicore/ops/rearrange/rearrange.cc (+15 −9)
src/infinicore/ops/rearrange/rearrange_infiniop.cc (+29 −30)
src/infinicore/ops/rms_norm/rms_norm.cc (+9 −11)
src/infinicore/ops/rms_norm/rms_norm_infiniop.cc (+42 −37)
src/infinicore/ops/rope/rope.cc (+28 −21)
src/infinicore/ops/rope/rope_infiniop.cc (+61 −49)
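Across these files the recurring change is the same: each op's eager `calculate(...)` entry point is split into a `plan`/`run`/`cleanup` triple so the op can be recorded into a graph once and replayed cheaply, which is what CUDA graph capture needs. Below is a minimal sketch of that contract; every type and name here is illustrative, not the library's actual API.

```cpp
// Sketch of the plan/run/cleanup lifecycle these diffs converge on.
#include <functional>
#include <utility>
#include <vector>

struct GraphOp {
    void *planned = nullptr;               // opaque per-op state built once by plan()
    std::function<void(void *)> run;       // replayed on every graph launch
    std::function<void(void **)> cleanup;  // frees the planned state
};

class CapturedGraph {
    std::vector<GraphOp> ops_;

public:
    void record(GraphOp op) { ops_.push_back(std::move(op)); }

    // Cheap: no descriptor creation, no hashing, no allocation on replay.
    void replay() {
        for (auto &op : ops_) {
            op.run(op.planned);
        }
    }

    ~CapturedGraph() {
        for (auto &op : ops_) {
            op.cleanup(&op.planned);
        }
    }
};
```

In the diffs that follow, `plan` does the hashing, descriptor lookup, and workspace allocation up front, while `run` touches only pre-resolved pointers.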
src/infinicore/ops/gemm/gemm_infiniop.cc:

```diff
@@ -11,7 +11,7 @@ struct PlannedMeta {
     float alpha, beta;
 };

-void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
+void *plan(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
     size_t seed = hash_combine(c, a, b);
     INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
     ...
```
src/infinicore/ops/infiniop_impl.hpp:

```diff
@@ -5,23 +5,46 @@
 #include "infinicore/ops/common/cache.hpp"

 #include <infiniop.h>

-#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
-    struct __DESC_TYPE__ { \
-        infiniop##__OP_NAME__##Descriptor_t desc; \
-        Descriptor(infiniop##__OP_NAME__##Descriptor_t desc) : desc(desc) {} \
-        ~Descriptor() { \
-            if (desc != nullptr) { \
-                infiniopDestroy##__OP_NAME__##Descriptor(desc); \
-                desc = nullptr; \
-            } \
-        } \
-    }; \
+#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
+    struct __DESC_TYPE__ { \
+        infiniop##__OP_NAME__##Descriptor_t desc = nullptr; \
+        \
+        explicit __DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t d) \
+            : desc(d) {} \
+        \
+        /* non-copyable */ \
+        __DESC_TYPE__(const __DESC_TYPE__ &) = delete; \
+        __DESC_TYPE__ &operator=(const __DESC_TYPE__ &) = delete; \
+        \
+        /* movable */ \
+        __DESC_TYPE__(__DESC_TYPE__ &&other) noexcept \
+            : desc(other.desc) { \
+            other.desc = nullptr; \
+        } \
+        \
+        __DESC_TYPE__ &operator=(__DESC_TYPE__ &&other) noexcept { \
+            if (this != &other) { \
+                if (desc != nullptr) { \
+                    infiniopDestroy##__OP_NAME__##Descriptor(desc); \
+                } \
+                desc = other.desc; \
+                other.desc = nullptr; \
+            } \
+            return *this; \
+        } \
+        \
+        ~__DESC_TYPE__() { \
+            if (desc != nullptr) { \
+                infiniopDestroy##__OP_NAME__##Descriptor(desc); \
+            } \
+        } \
+    }; \
     \
     thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
         caches( \
             __SIZE__, \
             [](std::shared_ptr<__DESC_TYPE__> &desc) { \
                 desc = nullptr; \
             });

 #define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
 ...
```
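The rewritten macro upgrades the generated descriptor wrapper from a destructor-only struct to a non-copyable, movable RAII type, so a raw `infiniop*Descriptor_t` can never be destroyed twice. Expanded by hand for a hypothetical `Foo` op (the handle type and destroy stub below are assumptions for illustration), the generated struct is equivalent to:

```cpp
struct FooDesc;                                   // opaque handle type (assumed)
using infiniopFooDescriptor_t = FooDesc *;
inline void infiniopDestroyFooDescriptor(infiniopFooDescriptor_t) {}  // stub

struct Descriptor {
    infiniopFooDescriptor_t desc = nullptr;

    explicit Descriptor(infiniopFooDescriptor_t d) : desc(d) {}

    // Non-copyable: two wrappers must never own the same raw handle.
    Descriptor(const Descriptor &) = delete;
    Descriptor &operator=(const Descriptor &) = delete;

    // Movable: ownership transfers and the source is emptied, so the
    // handle is destroyed exactly once.
    Descriptor(Descriptor &&other) noexcept : desc(other.desc) {
        other.desc = nullptr;
    }
    Descriptor &operator=(Descriptor &&other) noexcept {
        if (this != &other) {
            if (desc) infiniopDestroyFooDescriptor(desc);
            desc = other.desc;
            other.desc = nullptr;
        }
        return *this;
    }

    ~Descriptor() {
        if (desc) infiniopDestroyFooDescriptor(desc);
    }
};
```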
src/infinicore/ops/kv_caching/kv_caching.cc
0 → 100644
View file @
784139b9
#include "infinicore/ops/kv_caching.hpp"
#include "../../utils.hpp"
namespace
infinicore
::
op
{
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL
(
KVCaching
);
KVCaching
::
KVCaching
(
Tensor
k_cache
,
Tensor
v_cache
,
const
Tensor
&
k
,
const
Tensor
&
v
,
const
Tensor
&
past_kv_lengths
)
{
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
k_cache
,
v_cache
,
k
,
v
,
past_kv_lengths
);
INFINICORE_GRAPH_OP_DISPATCH
(
k_cache
->
device
().
getType
(),
k_cache
,
v_cache
,
k
,
v
,
past_kv_lengths
);
}
void
KVCaching
::
execute
(
Tensor
k_cache
,
Tensor
v_cache
,
const
Tensor
&
k
,
const
Tensor
&
v
,
const
Tensor
&
past_kv_lengths
)
{
INFINICORE_GRAPH_OP_RECORD_OR_RUN
(
KVCaching
,
k_cache
,
v_cache
,
k
,
v
,
past_kv_lengths
);
}
void
kv_caching_
(
Tensor
k_cache
,
Tensor
v_cache
,
const
Tensor
&
k
,
const
Tensor
&
v
,
const
Tensor
&
past_kv_lengths
)
{
KVCaching
::
execute
(
k_cache
,
v_cache
,
k
,
v
,
past_kv_lengths
);
}
}
// namespace infinicore::op
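A plausible call site for the new `kv_caching_` entry point during batched decode; the helper function and the shapes in the comments are assumptions for illustration, not code from this commit:

```cpp
#include "infinicore/ops/kv_caching.hpp"

using namespace infinicore;

// Hypothetical per-layer decode step.
void decode_step(Tensor k_cache, Tensor v_cache,  // preallocated cache tensors
                 Tensor k, Tensor v,              // this step's K/V projections
                 Tensor past_kv_lengths) {        // [batch] lengths, on device
    // Appends k/v at each sequence's current length. Because the lengths are
    // read from a device tensor rather than baked in at plan time, the op can
    // be captured in a CUDA graph once and replayed as decoding advances.
    op::kv_caching_(k_cache, v_cache, k, v, past_kv_lengths);
}
```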
src/infinicore/ops/kv_caching/kv_caching_infiniop.cc
0 → 100644
View file @
784139b9
#include "../infiniop_impl.hpp"
#include "infinicore/ops/kv_caching.hpp"
namespace
infinicore
::
op
::
kv_caching_impl
::
infiniop
{
INFINIOP_CACHABLE_DESCRIPTOR
(
Descriptor
,
KVCaching
,
100
);
struct
PlannedMeta
{
std
::
shared_ptr
<
Descriptor
>
descriptor
;
graph
::
GraphTensor
workspace
,
k_cache
,
v_cache
,
k
,
v
,
past_kv_lengths
;
};
void
*
plan
(
Tensor
k_cache
,
Tensor
v_cache
,
const
Tensor
&
k
,
const
Tensor
&
v
,
const
Tensor
&
past_kv_lengths
)
{
size_t
seed
=
hash_combine
(
k_cache
,
v_cache
,
k
,
v
,
past_kv_lengths
);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE
(
Descriptor
,
descriptor
,
KVCaching
,
seed
,
k_cache
->
desc
(),
v_cache
->
desc
(),
k
->
desc
(),
v
->
desc
(),
past_kv_lengths
->
desc
());
INFINIOP_WORKSPACE_TENSOR
(
workspace
,
KVCaching
,
descriptor
);
auto
planned
=
new
PlannedMeta
{
descriptor
,
graph
::
GraphTensor
(
workspace
),
graph
::
GraphTensor
(
k_cache
),
graph
::
GraphTensor
(
v_cache
),
graph
::
GraphTensor
(
k
),
graph
::
GraphTensor
(
v
),
graph
::
GraphTensor
(
past_kv_lengths
)};
return
planned
;
}
void
run
(
void
*
planned_meta
)
{
auto
planned
=
reinterpret_cast
<
PlannedMeta
*>
(
planned_meta
);
INFINICORE_CHECK_ERROR
(
infiniopKVCaching
(
planned
->
descriptor
->
desc
,
nullptr
,
0
,
planned
->
k_cache
->
data
(),
planned
->
v_cache
->
data
(),
planned
->
k
->
data
(),
planned
->
v
->
data
(),
planned
->
past_kv_lengths
->
data
(),
context
::
getStream
()));
}
void
cleanup
(
void
**
planned_meta_ptr
)
{
delete
*
reinterpret_cast
<
PlannedMeta
**>
(
planned_meta_ptr
);
*
planned_meta_ptr
=
nullptr
;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE
(
KVCaching
,
&
plan
,
&
run
,
cleanup
);
}
// namespace infinicore::op::kv_caching_impl::infiniop
src/infinicore/ops/linear_w4a16_awq/linear_w4a16_awq.cc
0 → 100644
View file @
784139b9
#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include "infinicore/ops/gemm.hpp"
namespace
infinicore
::
op
{
Tensor
linear_w4a16_awq
(
Tensor
input
,
Tensor
weight_packed
,
Tensor
weight_scale
,
Tensor
weight_zeros
,
std
::
optional
<
Tensor
>
bias
)
{
// Input is of shape [M, K], Weight_packed is of shape [N, K],stirdes is [N, 1]
Size
ndim
=
input
->
ndim
();
Size
out_features
=
weight_packed
->
shape
()[
0
];
// Assign memory to out variables
auto
output_shape
=
input
->
shape
();
output_shape
[
ndim
-
1
]
=
out_features
;
auto
out
=
Tensor
::
empty
(
output_shape
,
input
->
dtype
(),
input
->
device
());
// Inplace Calculate
linear_w4a16_awq_
(
out
,
input
,
weight_packed
,
weight_scale
,
weight_zeros
,
bias
);
return
out
;
}
void
linear_w4a16_awq_
(
Tensor
out
,
Tensor
input
,
Tensor
weight_packed
,
Tensor
weight_scale
,
Tensor
weight_zeros
,
std
::
optional
<
Tensor
>
bias
)
{
auto
weight_packed_shape
=
weight_packed
->
shape
();
Size
out_features
=
weight_packed_shape
[
0
];
Size
in_features
=
weight_packed_shape
[
1
];
Size
ndim
=
input
->
ndim
();
assert
(
out
->
ndim
()
==
ndim
);
Size
N
=
1
;
auto
input_shape
=
input
->
shape
();
for
(
size_t
i
=
0
;
i
<
ndim
-
1
;
++
i
)
{
N
*=
input_shape
[
i
];
}
auto
weight
=
Tensor
::
empty
(
{
out_features
,
in_features
},
out
->
dtype
(),
weight_packed
->
device
());
float
alpha
=
1.0
f
;
float
beta
=
0.0
f
;
op
::
dequantize_awq_
(
weight
,
weight_packed
,
weight_scale
,
weight_zeros
);
bias
=
std
::
make_optional
(
bias
.
value
()
->
as_strided
({
N
,
out_features
},
{
0
,
1
}));
gemm_
(
out
->
view
({
N
,
out_features
}),
input
->
view
({
N
,
in_features
}),
weight
->
permute
({
1
,
0
}),
alpha
,
beta
);
}
}
// namespace infinicore::op
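The GEMM here runs on a 2-D view: every leading dimension of the input is folded into a single row count `N` before `gemm_` is called. A plain-C++ restatement of that folding, for reference:

```cpp
#include <cstddef>
#include <vector>

// Product of all dimensions except the last,
// e.g. shape [2, 8, 4096] -> N = 16, so the GEMM sees [16, 4096].
std::size_t flatten_leading(const std::vector<std::size_t> &shape) {
    std::size_t n = 1;
    for (std::size_t i = 0; i + 1 < shape.size(); ++i) {
        n *= shape[i];
    }
    return n;
}
```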
src/infinicore/ops/linear_w8a8i8/linear_w8a8i8.cc
0 → 100644
View file @
784139b9
#include "infinicore/ops/linear_w8a8i8.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"
namespace
infinicore
::
op
{
Tensor
linear_w8a8i8
(
Tensor
input
,
Tensor
weight_packed
,
Tensor
weight_scale
,
std
::
optional
<
Tensor
>
bias
)
{
// Input is of shape [M, K], Weight_packed is of shape [N, K],stirdes is [N, 1]
Size
ndim
=
input
->
ndim
();
Size
out_features
=
weight_packed
->
shape
()[
0
];
// Assign memory to out variables
auto
output_shape
=
input
->
shape
();
output_shape
[
ndim
-
1
]
=
out_features
;
auto
out
=
Tensor
::
empty
(
output_shape
,
input
->
dtype
(),
input
->
device
());
// Inplace Calculate
linear_w8a8i8_
(
out
,
input
,
weight_packed
,
weight_scale
,
bias
);
return
out
;
}
void
linear_w8a8i8_
(
Tensor
out
,
Tensor
input
,
Tensor
weight_packed
,
Tensor
weight_scale
,
std
::
optional
<
Tensor
>
bias
)
{
auto
weight_packed_shape
=
weight_packed
->
shape
();
Size
out_features
=
weight_packed_shape
[
0
];
Size
in_features
=
weight_packed_shape
[
1
];
Size
ndim
=
input
->
ndim
();
assert
(
out
->
ndim
()
==
ndim
);
Size
N
=
1
;
auto
input_shape
=
input
->
shape
();
for
(
size_t
i
=
0
;
i
<
ndim
-
1
;
++
i
)
{
N
*=
input_shape
[
i
];
}
auto
input_packed
=
Tensor
::
empty
(
{
N
,
input_shape
[
ndim
-
1
]},
DataType
::
I8
,
input
->
device
());
auto
input_scale
=
Tensor
::
empty
(
{
N
,
1
},
DataType
::
F32
,
input
->
device
());
op
::
per_channel_quant_i8_
(
input
->
view
({
N
,
in_features
}),
input_packed
,
input_scale
);
if
(
bias
.
has_value
())
{
bias
=
std
::
make_optional
(
bias
.
value
()
->
as_strided
({
N
,
out_features
},
{
0
,
1
}));
}
op
::
scaled_mm_i8_
(
out
->
view
({
N
,
out_features
}),
input_packed
,
input_scale
,
weight_packed
->
permute
({
1
,
0
}),
weight_scale
,
bias
);
}
}
// namespace infinicore::op
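`per_channel_quant_i8_` is used above to quantize each activation row to int8 with a single float scale per row. A CPU reference of the presumable math (symmetric per-row quantization), not the device kernel:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Quantize one row of k floats to int8 with one per-row scale, so x ≈ q * scale.
// Symmetric scheme: the zero-point is fixed at 0.
void quant_row(const float *x, std::size_t k, std::int8_t *q, float &scale) {
    float amax = 0.0f;
    for (std::size_t i = 0; i < k; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
    }
    scale = (amax > 0.0f) ? amax / 127.0f : 1.0f;
    for (std::size_t i = 0; i < k; ++i) {
        q[i] = static_cast<std::int8_t>(std::lround(x[i] / scale));
    }
}
```

`scaled_mm_i8_` would then undo the scaling on the integer accumulator, out ≈ (qA · qB) · scale_A · scale_B, before adding the bias.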
src/infinicore/ops/mul/mul.cc:

```diff
 #include "infinicore/ops/mul.hpp"

 #include "../../utils.hpp"

 namespace infinicore::op {

-common::OpDispatcher<Mul::schema> &Mul::dispatcher() {
-    static common::OpDispatcher<Mul::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Mul);

-void Mul::execute(Tensor c, Tensor a, Tensor b) {
+Mul::Mul(Tensor c, const Tensor &a, const Tensor &b) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
-    infinicore::context::setDevice(c->device());
-    dispatcher().lookup(c->device().getType())(c, a, b);
+    INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
 }

+void Mul::execute(Tensor c, const Tensor &a, const Tensor &b) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Mul, c, a, b);
+}

-Tensor mul(Tensor a, Tensor b) {
+Tensor mul(const Tensor &a, const Tensor &b) {
     auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
     mul_(c, a, b);
     return c;
 }

-void mul_(Tensor c, Tensor a, Tensor b) {
+void mul_(Tensor c, const Tensor &a, const Tensor &b) {
     Mul::execute(c, a, b);
 }
 ...
```
src/infinicore/ops/mul/mul_infiniop.cc:

```diff
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/mul.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"

 namespace infinicore::op::mul_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopMulDescriptor_t>
-    caches(100, // capacity
-           [](infiniopMulDescriptor_t &desc) {
-               if (desc != nullptr) {
-                   INFINICORE_CHECK_ERROR(infiniopDestroyMulDescriptor(desc));
-                   desc = nullptr;
-               }
-           });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Mul, 100);

+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, c, a, b;
+};

-void calculate(Tensor c, Tensor a, Tensor b) {
+void *plan(Tensor c, const Tensor &a, const Tensor &b) {
     size_t seed = hash_combine(c, b, a);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopMulDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateMulDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            c->desc(), a->desc(), b->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetMulWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-    INFINICORE_CHECK_ERROR(infiniopMul(
-        desc, workspace->data(), workspace_size,
-        c->data(), a->data(), b->data(), context::getStream()));
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, Mul, seed,
+        c->desc(), a->desc(), b->desc());
+    INFINIOP_WORKSPACE_TENSOR(workspace, Mul, descriptor);
+    return new PlannedMeta{descriptor,
+                           graph::GraphTensor(workspace),
+                           graph::GraphTensor(c),
+                           graph::GraphTensor(a),
+                           graph::GraphTensor(b)};
 }

+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
+    INFINICORE_CHECK_ERROR(infiniopMul(
+        planned->descriptor->desc,
+        planned->workspace->data(), planned->workspace->numel(),
+        planned->c->data(), planned->a->data(), planned->b->data(),
+        context::getStream()));
+}

+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    Mul::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Mul, &plan, &run, &cleanup);

 } // namespace infinicore::op::mul_impl::infiniop
```
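`INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE` packages the get-or-create idiom that the removed hand-written code spelled out above. Written out against a plain `std::unordered_map` for clarity (the real cache is a per-device `OpCache` with eviction, which this sketch omits):

```cpp
#include <cstddef>
#include <memory>
#include <unordered_map>

// Look up a descriptor by hash key; on a miss, build it once and cache it.
template <class Desc, class Make>
std::shared_ptr<Desc> get_or_create(
    std::unordered_map<std::size_t, std::shared_ptr<Desc>> &cache,
    std::size_t key, Make make) {
    auto it = cache.find(key);
    if (it != cache.end()) {
        return it->second;             // hit: reuse the cached descriptor
    }
    std::shared_ptr<Desc> d = make();  // miss: create the descriptor once
    cache.emplace(key, d);
    return d;
}
```

Keying on the hash of shapes, strides, and dtypes means descriptor creation is paid only the first time a given tensor configuration is seen.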
src/infinicore/ops/paged_attention/paged_attention.cc:

```diff
 #include "infinicore/ops/paged_attention.hpp"

 #include "../../utils.hpp"

 namespace infinicore::op {

-common::OpDispatcher<PagedAttention::schema> &PagedAttention::dispatcher() {
-    static common::OpDispatcher<PagedAttention::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedAttention);

-void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-                             Tensor block_tables, Tensor kv_lens,
-                             std::optional<Tensor> alibi_slopes, float scale) {
+PagedAttention::PagedAttention(Tensor out, const Tensor &q, const Tensor &k_cache,
+                               const Tensor &v_cache, const Tensor &block_tables,
+                               const Tensor &kv_lens,
+                               std::optional<Tensor> alibi_slopes, float scale) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache,
                                           block_tables, kv_lens);
-    infinicore::context::setDevice(out->device());
-    dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache,
-                                                 block_tables, kv_lens,
-                                                 alibi_slopes, scale);
+    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, q, k_cache, v_cache,
+                                 block_tables, kv_lens, alibi_slopes, scale);
 }

+void PagedAttention::execute(Tensor out, const Tensor &q, const Tensor &k_cache,
+                             const Tensor &v_cache, const Tensor &block_tables,
+                             const Tensor &kv_lens,
+                             std::optional<Tensor> alibi_slopes, float scale) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(PagedAttention, out, q, k_cache, v_cache,
+                                      block_tables, kv_lens, alibi_slopes, scale);
+}

-Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables,
-                       Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
+Tensor paged_attention(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
+                       const Tensor &block_tables, const Tensor &kv_lens,
+                       std::optional<Tensor> alibi_slopes, float scale) {
     auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
     paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens,
                      alibi_slopes, scale);
     return out;
 }

-void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-                      Tensor block_tables, Tensor kv_lens,
-                      std::optional<Tensor> alibi_slopes, float scale) {
+void paged_attention_(Tensor out, const Tensor &q, const Tensor &k_cache,
+                      const Tensor &v_cache, const Tensor &block_tables,
+                      const Tensor &kv_lens,
+                      std::optional<Tensor> alibi_slopes, float scale) {
     PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens,
                             alibi_slopes, scale);
 }
 ...
```
src/infinicore/ops/paged_attention/paged_attention_infiniop.cc:

```diff
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/paged_attention.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"

 namespace infinicore::op::paged_attention_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopPagedAttentionDescriptor_t>
-    caches(100, // capacity
-           [](infiniopPagedAttentionDescriptor_t &desc) {
-               if (desc != nullptr) {
-                   INFINICORE_CHECK_ERROR(infiniopDestroyPagedAttentionDescriptor(desc));
-                   desc = nullptr;
-               }
-           });

-void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
-               Tensor block_tables, Tensor kv_lens,
-               std::optional<Tensor> alibi_slopes, float scale) {
-    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens,
-                               alibi_slopes, scale);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopPagedAttentionDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
-            block_tables->desc(), kv_lens->desc(),
-            alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
-            scale));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetPagedAttentionWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-    INFINICORE_CHECK_ERROR(infiniopPagedAttention(
-        desc, workspace->data(), workspace_size,
-        out->data(), q->data(), k_cache->data(), v_cache->data(),
-        block_tables->data(), kv_lens->data(),
-        alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
-        context::getStream()));
-}
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedAttention, 100);

+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, out, q, k_cache, v_cache, block_tables, cache_lens;
+    std::optional<graph::GraphTensor> alibi_slopes;
+    float scale;
+};

+void *plan(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
+           const Tensor &block_tables, const Tensor &cache_lens,
+           std::optional<Tensor> alibi_slopes, float scale) {
+    size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens,
+                               alibi_slopes);
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, PagedAttention, seed,
+        out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
+        block_tables->desc(), cache_lens->desc(),
+        alibi_slopes ? alibi_slopes.value()->desc() : nullptr, scale);
+    INFINIOP_WORKSPACE_TENSOR(workspace, PagedAttention, descriptor);
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(out),
+        graph::GraphTensor(q),
+        graph::GraphTensor(k_cache),
+        graph::GraphTensor(v_cache),
+        graph::GraphTensor(block_tables),
+        graph::GraphTensor(cache_lens),
+        alibi_slopes ? std::optional<graph::GraphTensor>(graph::GraphTensor(*alibi_slopes))
+                     : std::nullopt,
+        scale};
+}

+void run(void *planned_meta) {
+    auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
+    INFINICORE_CHECK_ERROR(infiniopPagedAttention(
+        p->descriptor->desc,
+        p->workspace->data(), p->workspace->numel(),
+        p->out->data(), p->q->data(), p->k_cache->data(), p->v_cache->data(),
+        p->block_tables->data(), p->cache_lens->data(),
+        p->alibi_slopes.has_value() ? p->alibi_slopes.value()->data() : nullptr,
+        context::getStream()));
+}

+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    PagedAttention::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedAttention, &plan, &run, &cleanup);

 } // namespace infinicore::op::paged_attention_impl::infiniop
```
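For reference, this is how a paged KV cache turns a logical token position into a physical cache slot via `block_tables`; it is the standard paged-attention bookkeeping, sketched here for illustration rather than taken from this commit's kernels:

```cpp
#include <cstdint>

// One block_tables row per sequence maps a logical block index to a physical
// block; the KV cache is laid out as [num_blocks, block_size, ...].
std::int64_t physical_slot(const std::int32_t *block_table,
                           std::int64_t token_pos,
                           std::int64_t block_size) {
    const std::int64_t block = block_table[token_pos / block_size];
    const std::int64_t offset = token_pos % block_size;
    return block * block_size + offset;
}
```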
src/infinicore/ops/paged_caching/paged_caching.cc:

```diff
 #include "infinicore/ops/paged_caching.hpp"

 #include "../../utils.hpp"

 namespace infinicore::op {

-common::OpDispatcher<PagedCaching::schema> &PagedCaching::dispatcher() {
-    static common::OpDispatcher<PagedCaching::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedCaching);

-void PagedCaching::execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v,
-                           Tensor slot_mapping) {
+PagedCaching::PagedCaching(Tensor k_cache, Tensor v_cache, const Tensor &k,
+                           const Tensor &v, const Tensor &slot_mapping) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, slot_mapping);
-    infinicore::context::setDevice(k_cache->device());
-    dispatcher().lookup(k_cache->device().getType())(k_cache, v_cache, k, v,
-                                                     slot_mapping);
+    INFINICORE_GRAPH_OP_DISPATCH(k->device().getType(), k_cache, v_cache, k, v,
+                                 slot_mapping);
 }

+void PagedCaching::execute(Tensor k_cache, Tensor v_cache, const Tensor &k,
+                           const Tensor &v, const Tensor &slot_mapping) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(PagedCaching, k_cache, v_cache, k, v,
+                                      slot_mapping);
+}

-void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v,
-                    Tensor slot_mapping) {
+void paged_caching_(Tensor k_cache, Tensor v_cache, const Tensor &k,
+                    const Tensor &v, const Tensor &slot_mapping) {
     PagedCaching::execute(k_cache, v_cache, k, v, slot_mapping);
 }
 ...
```
src/infinicore/ops/paged_caching/paged_caching_infiniop.cc:

```diff
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/paged_caching.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"

 namespace infinicore::op::paged_caching_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopPagedCachingDescriptor_t>
-    caches(100, // capacity
-           [](infiniopPagedCachingDescriptor_t &desc) {
-               if (desc != nullptr) {
-                   INFINICORE_CHECK_ERROR(infiniopDestroyPagedCachingDescriptor(desc));
-                   desc = nullptr;
-               }
-           });

-void calculate(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v,
-               Tensor slot_mapping) {
-    size_t seed = hash_combine(k_cache, v_cache, k, v, slot_mapping);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopPagedCachingDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreatePagedCachingDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            k_cache->desc(), v_cache->desc(), k->desc(), v->desc(),
-            slot_mapping->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetPagedCachingWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-    INFINICORE_CHECK_ERROR(infiniopPagedCaching(
-        desc, workspace->data(), workspace_size,
-        k_cache->data(), v_cache->data(), k->data(), v->data(),
-        slot_mapping->data(), context::getStream()));
-}
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedCaching, 100);

+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, k_cache, v_cache, k, v, slot_mapping;
+};

+void *plan(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v,
+           const Tensor &slot_mapping) {
+    size_t key = hash_combine(k_cache, v_cache, k, v, slot_mapping);
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, PagedCaching, key,
+        k_cache->desc(), v_cache->desc(), k->desc(), v->desc(),
+        slot_mapping->desc());
+    INFINIOP_WORKSPACE_TENSOR(workspace, PagedCaching, descriptor);
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(k_cache),
+        graph::GraphTensor(v_cache),
+        graph::GraphTensor(k),
+        graph::GraphTensor(v),
+        graph::GraphTensor(slot_mapping)};
+}

+void run(void *planned_meta) {
+    auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
+    INFINICORE_CHECK_ERROR(infiniopPagedCaching(
+        p->descriptor->desc,
+        p->workspace->data(), p->workspace->numel(),
+        p->k_cache->data(), p->v_cache->data(), p->k->data(), p->v->data(),
+        p->slot_mapping->data(), context::getStream()));
+}

+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    PagedCaching::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedCaching, &plan, &run, &cleanup);

 } // namespace infinicore::op::paged_caching_impl::infiniop
```
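`paged_caching_` writes each incoming token's K/V into a precomputed slot. A CPU reference sketch of that scatter, assuming float storage and a contiguous `[num_slots, head_dim]` cache layout (the real kernel runs on device and supports other dtypes):

```cpp
#include <cstdint>
#include <cstring>

// Scatter n_tokens K/V rows into the caches at the slots named by slot_mapping.
void paged_caching_ref(float *k_cache, float *v_cache,
                       const float *k, const float *v,
                       const std::int64_t *slot_mapping,
                       std::int64_t n_tokens, std::int64_t head_dim) {
    for (std::int64_t i = 0; i < n_tokens; ++i) {
        const std::int64_t slot = slot_mapping[i];
        std::memcpy(k_cache + slot * head_dim, k + i * head_dim,
                    head_dim * sizeof(float));
        std::memcpy(v_cache + slot * head_dim, v + i * head_dim,
                    head_dim * sizeof(float));
    }
}
```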
src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8.cc (new file, mode 100644):

```cpp
#include "infinicore/ops/per_channel_quant_i8.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerChannelQuantI8);

PerChannelQuantI8::PerChannelQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale);
    INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale);
}

void PerChannelQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerChannelQuantI8, x, x_packed, x_scale);
}

void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale) {
    PerChannelQuantI8::execute(x, x_packed, x_scale);
}

} // namespace infinicore::op
```
src/infinicore/ops/per_channel_quant_i8/per_channel_quant_i8_infiniop.cc (new file, mode 100644):

```cpp
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"

#include <infiniop.h>

namespace infinicore::op::per_channel_quant_i8_impl::infiniop {

INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerChannelQuantI8, 100);

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, x, x_packed, x_scale;
};

void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale) {
    size_t seed = hash_combine(x, x_packed, x_scale);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, PerChannelQuantI8, seed,
        x_packed->desc(), x_scale->desc(), nullptr, x->desc());
    INFINIOP_WORKSPACE_TENSOR(workspace, PerChannelQuantI8, descriptor);
    return new PlannedMeta{
        descriptor,
        graph::GraphTensor(workspace),
        graph::GraphTensor(x),
        graph::GraphTensor(x_packed),
        graph::GraphTensor(x_scale)};
}

void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopPerChannelQuantI8(
        planned->descriptor->desc,
        planned->workspace->data(), planned->workspace->numel(),
        planned->x_packed->data(), planned->x_scale->data(), nullptr,
        planned->x->data(), context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerChannelQuantI8, &plan, &run, &cleanup);

} // namespace infinicore::op::per_channel_quant_i8_impl::infiniop
```
src/infinicore/ops/rearrange/rearrange.cc:

```diff
@@ -3,24 +3,30 @@
 namespace infinicore::op {

-common::OpDispatcher<Rearrange::schema> &Rearrange::dispatcher() {
-    static common::OpDispatcher<Rearrange::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Rearrange);

-void Rearrange::execute(Tensor y, Tensor x) {
+Rearrange::Rearrange(Tensor y, const Tensor &x) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
-    infinicore::context::setDevice(y->device());
-    dispatcher().lookup(y->device().getType())(y, x);
+    INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x);
 }

+void Rearrange::execute(Tensor y, const Tensor &x) {
+    auto op = std::make_shared<Rearrange>(y, x);
+    if (context::isGraphRecording()) {
+        context::addGraphOperator(op);
+    } else {
+        op->run();
+    }
+}

-Tensor rearrange(Tensor x) {
+Tensor rearrange(const Tensor &x) {
     auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
     rearrange_(y, x);
     return y;
 }

-void rearrange_(Tensor y, Tensor x) {
+void rearrange_(Tensor y, const Tensor &x) {
     Rearrange::execute(y, x);
 }

 } // namespace infinicore::op
```
src/infinicore/ops/rearrange/rearrange_infiniop.cc:

```diff
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/rearrange.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"

 namespace infinicore::op::rearrange_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopRearrangeDescriptor_t>
-    caches(100, // capacity
-           [](infiniopRearrangeDescriptor_t &desc) {
-               if (desc != nullptr) {
-                   INFINICORE_CHECK_ERROR(infiniopDestroyRearrangeDescriptor(desc));
-                   desc = nullptr;
-               }
-           });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Rearrange, 100);

+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor y, x;
+};

-void calculate(Tensor y, Tensor x) {
+void *plan(Tensor y, const Tensor &x) {
     size_t seed = hash_combine(y, x);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopRearrangeDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateRearrangeDescriptor(
-            context::getInfiniopHandle(device), &desc, y->desc(), x->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    INFINICORE_CHECK_ERROR(infiniopRearrange(
-        desc, y->data(), x->data(), context::getStream()));
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, Rearrange, seed, y->desc(), x->desc());
+    return new PlannedMeta{descriptor, graph::GraphTensor(y), graph::GraphTensor(x)};
 }

+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
+    INFINICORE_CHECK_ERROR(infiniopRearrange(
+        planned->descriptor->desc,
+        planned->y->data(), planned->x->data(), context::getStream()));
+}

+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    Rearrange::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Rearrange, &plan, &run, &cleanup);

 } // namespace infinicore::op::rearrange_impl::infiniop
```
src/infinicore/ops/rms_norm/rms_norm.cc:

```diff
 #include "infinicore/ops/rms_norm.hpp"

 #include "../../utils.hpp"

 namespace infinicore::op {

+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RMSNorm);

-common::OpDispatcher<RMSNorm::schema> &RMSNorm::dispatcher() {
-    static common::OpDispatcher<RMSNorm::schema> dispatcher_;
-    return dispatcher_;
-};

-void RMSNorm::execute(Tensor y, Tensor x, Tensor weight, float epsilon) {
+RMSNorm::RMSNorm(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x, weight);
-    infinicore::context::setDevice(y->device());
-    dispatcher().lookup(y->device().getType())(y, x, weight, epsilon);
+    INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x, weight, epsilon);
 }

+void RMSNorm::execute(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(RMSNorm, y, x, weight, epsilon);
+}

-Tensor rms_norm(Tensor x, Tensor weight, float epsilon) {
+Tensor rms_norm(const Tensor &x, const Tensor &weight, float epsilon) {
     auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
     rms_norm_(y, x, weight, epsilon);
     return y;
 }

-void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon) {
+void rms_norm_(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
     RMSNorm::execute(y, x, weight, epsilon);
 }
 ...
```
src/infinicore/ops/rms_norm/rms_norm_infiniop.cc:

```diff
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/rms_norm.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"

 namespace infinicore::op::rms_norm_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopRMSNormDescriptor_t>
-    caches(100, // capacity
-           [](infiniopRMSNormDescriptor_t &desc) {
-               if (desc != nullptr) {
-                   INFINICORE_CHECK_ERROR(infiniopDestroyRMSNormDescriptor(desc));
-                   desc = nullptr;
-               }
-           });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RMSNorm, 100);

+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, y, x, weight;
+};

-void calculate(Tensor y, Tensor x, Tensor weight, float epsilon) {
+void *plan(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
     size_t seed = hash_combine(y, x, weight, epsilon);
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(seed);
-    infiniopRMSNormDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateRMSNormDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            y->desc(), x->desc(), weight->desc(), epsilon));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-    INFINICORE_CHECK_ERROR(infiniopRMSNorm(
-        desc, workspace->data(), workspace_size,
-        y->data(), x->data(), weight->data(), context::getStream()));
-}
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, RMSNorm, seed,
+        y->desc(), x->desc(), weight->desc(), epsilon);
+    INFINIOP_WORKSPACE_TENSOR(workspace, RMSNorm, descriptor);
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(y),
+        graph::GraphTensor(x),
+        graph::GraphTensor(weight)};
+}

+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
+    INFINICORE_CHECK_ERROR(infiniopRMSNorm(
+        planned->descriptor->desc,
+        planned->workspace->data(), planned->workspace->numel(),
+        planned->y->data(), planned->x->data(), planned->weight->data(),
+        context::getStream()));
+}

+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    RMSNorm::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RMSNorm, &plan, &run, &cleanup);

 } // namespace infinicore::op::rms_norm_impl::infiniop
```
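For reference, the computation the `infiniopRMSNorm` call performs, as a single-row CPU sketch: y = x / sqrt(mean(x²) + ε) · weight.

```cpp
#include <cmath>
#include <cstddef>

// One row of RMSNorm: y[i] = x[i] / sqrt(mean(x^2) + eps) * w[i].
void rms_norm_row(const float *x, const float *w, float *y,
                  std::size_t d, float eps) {
    float ss = 0.0f;
    for (std::size_t i = 0; i < d; ++i) {
        ss += x[i] * x[i];
    }
    const float inv_rms = 1.0f / std::sqrt(ss / static_cast<float>(d) + eps);
    for (std::size_t i = 0; i < d; ++i) {
        y[i] = x[i] * inv_rms * w[i];
    }
}
```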
src/infinicore/ops/rope/rope.cc:

```diff
 #include "infinicore/ops/rope.hpp"

 #include "../../utils.hpp"
 #include "infinicore/context/context.hpp"

 #include <stdexcept>

 namespace infinicore::op {

-common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
-    static common::OpDispatcher<RoPE::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RoPE);

-void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos,
-                   const Tensor &sin_table, const Tensor &cos_table,
-                   infinicore::nn::RoPE::Algo algo) {
+RoPE::RoPE(Tensor x_out, const Tensor &x, const Tensor &pos,
+           const Tensor &sin_table, const Tensor &cos_table,
+           infinicore::nn::RoPE::Algo algo) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x_out, x, pos, sin_table, cos_table);
-    infinicore::context::setDevice(x_out->device());
-    auto device_type = x_out->device().getType();
-    auto func = dispatcher().lookup(device_type);
-    if (func == nullptr) {
-        throw std::runtime_error("No RoPE implementation found for device type: "
-                                 + std::to_string(static_cast<int>(device_type)));
-    }
-    func(x_out, x, pos, sin_table, cos_table, algo);
+    INFINICORE_GRAPH_OP_DISPATCH(x_out->device().getType(),
+                                 x_out, x, pos, sin_table, cos_table, algo);
 }

+void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos,
+                   const Tensor &sin_table, const Tensor &cos_table,
+                   infinicore::nn::RoPE::Algo algo) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(RoPE, x_out, x, pos, sin_table, cos_table, algo);
+}

 void rope_(Tensor x_out, const Tensor &x, const Tensor &pos,
            const Tensor &sin_table, const Tensor &cos_table,
            infinicore::nn::RoPE::Algo algo) {
     RoPE::execute(x_out, x, pos, sin_table, cos_table, algo);
 }

 Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_table,
             const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
-    Shape shape = x->shape();
-    auto x_out = Tensor::empty(shape, x->dtype(), x->device());
+    auto x_out = Tensor::empty(x->shape(), x->dtype(), x->device());
     rope_(x_out, x, pos, sin_table, cos_table, algo);
     return x_out;
 }
 ...
```
src/infinicore/ops/rope/rope_infiniop.cc:

```diff
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
 #include "infinicore/ops/common/cache.hpp"
 #include "infinicore/ops/rope.hpp"

 #include <infiniop.h>

+#include "../infiniop_impl.hpp"

 namespace infinicore::op::rope_impl::infiniop {

-thread_local common::OpCache<size_t, infiniopRoPEDescriptor_t>
-    caches(100, // capacity
-           [](infiniopRoPEDescriptor_t &desc) {
-               if (desc != nullptr) {
-                   INFINICORE_CHECK_ERROR(infiniopDestroyRoPEDescriptor(desc));
-                   desc = nullptr;
-               }
-           });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RoPE, 100);

+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace;
+    graph::GraphTensor x_out;
+    graph::GraphTensor x;
+    graph::GraphTensor pos;
+    graph::GraphTensor sin;
+    graph::GraphTensor cos;
+};

-void calculate(Tensor x_out, const Tensor &x, const Tensor &pos,
-               const Tensor &sin_cache, const Tensor &cos_cache,
-               infinicore::nn::RoPE::Algo algo) {
-    // Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
-    infiniopRoPEAlgo_t infiniop_algo;
+// Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
+static infiniopRoPEAlgo_t to_infiniop_algo(infinicore::nn::RoPE::Algo algo) {
     switch (algo) {
     case infinicore::nn::RoPE::Algo::GPT_J:
-        infiniop_algo = INFINIOP_ROPE_ALGO_GPT_J;
-        break;
+        return INFINIOP_ROPE_ALGO_GPT_J;
     case infinicore::nn::RoPE::Algo::GPT_NEOX:
-        infiniop_algo = INFINIOP_ROPE_ALGO_GPT_NEOX;
-        break;
+        return INFINIOP_ROPE_ALGO_GPT_NEOX;
     default:
-        throw std::runtime_error("Unsupported RoPE algorithm: "
-                                 + std::to_string(static_cast<int>(algo)));
+        throw std::runtime_error("Unsupported RoPE algorithm");
     }
 }

-    // Create hash key for descriptor caching
-    size_t key = hash_combine(x_out, x, pos, sin_cache, cos_cache);
-    hash_combine(key, std::hash<int>()(static_cast<int>(infiniop_algo)));
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
-    auto desc_opt = cache.get(key);
-    infiniopRoPEDescriptor_t desc = nullptr;
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateRoPEDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            x_out->desc(), x->desc(), pos->desc(),
-            sin_cache->desc(), cos_cache->desc(), infiniop_algo));
-        cache.put(key, desc);
-    } else {
-        desc = *desc_opt;
-    }
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
-    // InfiniOP reads from x and writes to x_out (handles copying internally)
-    INFINICORE_CHECK_ERROR(infiniopRoPE(
-        desc, workspace->data(), workspace_size,
-        x_out->data(), x->data(), pos->data(),
-        sin_cache->data(), cos_cache->data(), context::getStream()));
-}
+void *plan(Tensor x_out, const Tensor &x, const Tensor &pos,
+           const Tensor &sin, const Tensor &cos,
+           infinicore::nn::RoPE::Algo algo) {
+    auto infiniop_algo = to_infiniop_algo(algo);
+    size_t key = hash_combine(x_out, x, pos, sin, cos,
+                              static_cast<int>(infiniop_algo));
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, RoPE, key,
+        x_out->desc(), x->desc(), pos->desc(), sin->desc(), cos->desc(),
+        infiniop_algo);
+    INFINIOP_WORKSPACE_TENSOR(workspace, RoPE, descriptor);
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(x_out),
+        graph::GraphTensor(x),
+        graph::GraphTensor(pos),
+        graph::GraphTensor(sin),
+        graph::GraphTensor(cos)};
+}

+void run(void *planned_meta) {
+    auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
+    // InfiniOP reads from x and writes to x_out (handles copying internally).
+    INFINICORE_CHECK_ERROR(infiniopRoPE(
+        p->descriptor->desc,
+        p->workspace->data(), p->workspace->numel(),
+        p->x_out->data(), p->x->data(), p->pos->data(),
+        p->sin->data(), p->cos->data(), context::getStream()));
+}

+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
+}

-static bool registered = []() {
-    RoPE::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RoPE, &plan, &run, &cleanup);

 } // namespace infinicore::op::rope_impl::infiniop
```
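The `Algo` enum selects between the two common RoPE layouts. A sketch of which element pairs each variant rotates within a head of dimension `d` (reference logic, not the kernel):

```cpp
#include <cstddef>

// Index pair rotated by RoPE for rotation index i in a head of dimension d:
//   GPT-J    (interleaved): (2i, 2i + 1)
//   GPT-NeoX (half-split):  (i, i + d/2)
void rope_pair(bool gpt_j, std::size_t d, std::size_t i,
               std::size_t &a, std::size_t &b) {
    if (gpt_j) {
        a = 2 * i;
        b = 2 * i + 1;
    } else {
        a = i;
        b = i + d / 2;
    }
}
```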