Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
8d09630a
Unverified
Commit
8d09630a
authored
Feb 11, 2026
by
gongchensu
Committed by
GitHub
Feb 11, 2026
Browse files
Merge branch 'demo131' into Issue/862
parents
ab52dead
012df56c
Changes
387
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
669 additions
and
150 deletions
+669
-150
src/infinicore/context/allocators/stream_ordered_allocator.hpp
...nfinicore/context/allocators/stream_ordered_allocator.hpp
+3
-3
src/infinicore/context/context_impl.cc
src/infinicore/context/context_impl.cc
+33
-10
src/infinicore/context/context_impl.hpp
src/infinicore/context/context_impl.hpp
+0
-2
src/infinicore/context/internal.hpp
src/infinicore/context/internal.hpp
+10
-0
src/infinicore/context/runtime/runtime.cc
src/infinicore/context/runtime/runtime.cc
+33
-4
src/infinicore/context/runtime/runtime.hpp
src/infinicore/context/runtime/runtime.hpp
+13
-2
src/infinicore/device.cc
src/infinicore/device.cc
+2
-0
src/infinicore/graph/graph.cc
src/infinicore/graph/graph.cc
+153
-0
src/infinicore/graph/graph_manager.hpp
src/infinicore/graph/graph_manager.hpp
+25
-0
src/infinicore/nn/embedding.cc
src/infinicore/nn/embedding.cc
+7
-0
src/infinicore/nn/linear.cc
src/infinicore/nn/linear.cc
+154
-27
src/infinicore/nn/rmsnorm.cc
src/infinicore/nn/rmsnorm.cc
+19
-0
src/infinicore/nn/rope.cc
src/infinicore/nn/rope.cc
+23
-5
src/infinicore/ops/add/add.cc
src/infinicore/ops/add/add.cc
+9
-9
src/infinicore/ops/add/add_infiniop.cc
src/infinicore/ops/add/add_infiniop.cc
+36
-34
src/infinicore/ops/add_rms_norm/add_rms_norm.cc
src/infinicore/ops/add_rms_norm/add_rms_norm.cc
+33
-0
src/infinicore/ops/add_rms_norm/add_rms_norm_infiniop.cc
src/infinicore/ops/add_rms_norm/add_rms_norm_infiniop.cc
+53
-0
src/infinicore/ops/causal_softmax/causal_softmax.cc
src/infinicore/ops/causal_softmax/causal_softmax.cc
+10
-20
src/infinicore/ops/causal_softmax/causal_softmax_infiniop.cc
src/infinicore/ops/causal_softmax/causal_softmax_infiniop.cc
+33
-34
src/infinicore/ops/dequantize_awq/dequantize_awq.cc
src/infinicore/ops/dequantize_awq/dequantize_awq.cc
+20
-0
No files found.
src/infinicore/context/allocators/
device_caching
_allocator.hpp
→
src/infinicore/context/allocators/
stream_ordered
_allocator.hpp
View file @
8d09630a
...
@@ -5,10 +5,10 @@
...
@@ -5,10 +5,10 @@
#include "../context_impl.hpp"
#include "../context_impl.hpp"
namespace
infinicore
{
namespace
infinicore
{
class
DeviceCaching
Allocator
:
public
MemoryAllocator
{
class
StreamOrdered
Allocator
:
public
MemoryAllocator
{
public:
public:
explicit
DeviceCaching
Allocator
(
Device
device
);
explicit
StreamOrdered
Allocator
(
Device
device
);
~
DeviceCaching
Allocator
()
=
default
;
~
StreamOrdered
Allocator
()
=
default
;
std
::
byte
*
allocate
(
size_t
size
)
override
;
std
::
byte
*
allocate
(
size_t
size
)
override
;
void
deallocate
(
std
::
byte
*
ptr
)
override
;
void
deallocate
(
std
::
byte
*
ptr
)
override
;
...
...
src/infinicore/context/context_impl.cc
View file @
8d09630a
#include "context_impl.hpp"
#include "context_impl.hpp"
#include "internal.hpp"
#include "../utils.hpp"
#include "../utils.hpp"
...
@@ -29,16 +30,18 @@ Runtime *ContextImpl::getCurrentRuntime() {
...
@@ -29,16 +30,18 @@ Runtime *ContextImpl::getCurrentRuntime() {
return
current_runtime_
;
return
current_runtime_
;
}
}
Runtime
*
ContextImpl
::
getCpuRuntime
()
{
return
runtime_table_
[
int
(
Device
::
Type
::
CPU
)][
0
].
get
();
}
void
ContextImpl
::
setDevice
(
Device
device
)
{
void
ContextImpl
::
setDevice
(
Device
device
)
{
if
(
device
==
getCurrentRuntime
()
->
device
())
{
if
(
device
==
getCurrentRuntime
()
->
device
())
{
// Do nothing if the device is already set.
// Do nothing if the device is already set.
return
;
return
;
}
}
thread_local
bool
warn_switch_runtime
=
false
;
if
(
getCurrentRuntime
()
->
isGraphRecording
()
&&
!
warn_switch_runtime
)
{
spdlog
::
warn
(
"Switching device runtime during graph recording may break the graph!"
);
warn_switch_runtime
=
true
;
}
if
(
runtime_table_
[
int
(
device
.
getType
())][
device
.
getIndex
()]
==
nullptr
)
{
if
(
runtime_table_
[
int
(
device
.
getType
())][
device
.
getIndex
()]
==
nullptr
)
{
// Lazy initialization of runtime if never set before.
// Lazy initialization of runtime if never set before.
runtime_table_
[
int
(
device
.
getType
())][
device
.
getIndex
()]
=
std
::
unique_ptr
<
Runtime
>
(
new
Runtime
(
device
));
runtime_table_
[
int
(
device
.
getType
())][
device
.
getIndex
()]
=
std
::
unique_ptr
<
Runtime
>
(
new
Runtime
(
device
));
...
@@ -100,11 +103,8 @@ infinirtStream_t getStream() {
...
@@ -100,11 +103,8 @@ infinirtStream_t getStream() {
}
}
infiniopHandle_t
getInfiniopHandle
(
Device
device
)
{
infiniopHandle_t
getInfiniopHandle
(
Device
device
)
{
if
(
device
.
getType
()
==
Device
::
Type
::
CPU
)
{
return
ContextImpl
::
singleton
().
getCpuRuntime
()
->
infiniopHandle
();
}
if
(
device
!=
getDevice
())
{
if
(
device
!=
getDevice
())
{
throw
std
::
runtime_error
(
"Requested device doesn't match current runtime."
);
setDevice
(
device
);
}
}
return
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
infiniopHandle
();
return
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
infiniopHandle
();
}
}
...
@@ -122,7 +122,8 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {
...
@@ -122,7 +122,8 @@ std::shared_ptr<Memory> allocateMemory(size_t size) {
}
}
std
::
shared_ptr
<
Memory
>
allocateHostMemory
(
size_t
size
)
{
std
::
shared_ptr
<
Memory
>
allocateHostMemory
(
size_t
size
)
{
return
ContextImpl
::
singleton
().
getCpuRuntime
()
->
allocateMemory
(
size
);
setDevice
(
Device
::
cpu
());
return
allocateMemory
(
size
);
}
}
std
::
shared_ptr
<
Memory
>
allocatePinnedHostMemory
(
size_t
size
)
{
std
::
shared_ptr
<
Memory
>
allocatePinnedHostMemory
(
size_t
size
)
{
...
@@ -142,7 +143,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {
...
@@ -142,7 +143,8 @@ void memcpyD2D(void *dst, const void *src, size_t size, bool async) {
}
}
void
memcpyH2H
(
void
*
dst
,
const
void
*
src
,
size_t
size
)
{
void
memcpyH2H
(
void
*
dst
,
const
void
*
src
,
size_t
size
)
{
return
ContextImpl
::
singleton
().
getCpuRuntime
()
->
memcpyD2D
(
dst
,
src
,
size
);
setDevice
(
Device
::
cpu
());
return
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
memcpyD2D
(
dst
,
src
,
size
);
}
}
// Timing API implementations
// Timing API implementations
...
@@ -178,6 +180,27 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
...
@@ -178,6 +180,27 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
streamWaitEvent
(
stream
,
event
);
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
streamWaitEvent
(
stream
,
event
);
}
}
bool
isGraphRecording
()
{
return
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
isGraphRecording
();
}
void
startGraphRecording
()
{
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
startGraphRecording
();
}
void
addGraphOperator
(
std
::
shared_ptr
<
graph
::
GraphOperator
>
op
)
{
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
addGraphOperator
(
op
);
}
std
::
shared_ptr
<
graph
::
Graph
>
stopGraphRecording
()
{
return
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
stopGraphRecording
();
}
std
::
shared_ptr
<
Memory
>
reinstantiateBlob
(
std
::
shared_ptr
<
Memory
>
blob
)
{
setDevice
(
blob
->
device
());
return
ContextImpl
::
singleton
().
getCurrentRuntime
()
->
reinstantiateBlob
(
blob
);
}
}
// namespace context
}
// namespace context
}
// namespace infinicore
}
// namespace infinicore
src/infinicore/context/context_impl.hpp
View file @
8d09630a
...
@@ -19,8 +19,6 @@ protected:
...
@@ -19,8 +19,6 @@ protected:
public:
public:
Runtime
*
getCurrentRuntime
();
Runtime
*
getCurrentRuntime
();
Runtime
*
getCpuRuntime
();
void
setDevice
(
Device
);
void
setDevice
(
Device
);
size_t
getDeviceCount
(
Device
::
Type
type
);
size_t
getDeviceCount
(
Device
::
Type
type
);
...
...
src/infinicore/context/internal.hpp
0 → 100644
View file @
8d09630a
#pragma once
#include "infinicore/device.hpp"
#include "infinicore/memory.hpp"
#include "infinicore/graph/graph.hpp"
namespace
infinicore
::
context
{
std
::
shared_ptr
<
Memory
>
reinstantiateBlob
(
std
::
shared_ptr
<
Memory
>
blob
);
};
src/infinicore/context/runtime/runtime.cc
View file @
8d09630a
...
@@ -2,19 +2,20 @@
...
@@ -2,19 +2,20 @@
#include "../../utils.hpp"
#include "../../utils.hpp"
#include "../allocators/device_caching_allocator.hpp"
#include "../allocators/device_pinned_allocator.hpp"
#include "../allocators/device_pinned_allocator.hpp"
#include "../allocators/host_allocator.hpp"
#include "../allocators/host_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "../allocators/stream_ordered_allocator.hpp"
namespace
infinicore
{
namespace
infinicore
{
Runtime
::
Runtime
(
Device
device
)
:
device_
(
device
)
{
Runtime
::
Runtime
(
Device
device
)
:
device_
(
device
)
,
graph_manager_
(
std
::
make_unique
<
graph
::
GraphManager
>
())
{
activate
();
activate
();
INFINICORE_CHECK_ERROR
(
infinirtStreamCreate
(
&
stream_
));
INFINICORE_CHECK_ERROR
(
infinirtStreamCreate
(
&
stream_
));
INFINICORE_CHECK_ERROR
(
infiniopCreateHandle
(
&
infiniop_handle_
));
INFINICORE_CHECK_ERROR
(
infiniopCreateHandle
(
&
infiniop_handle_
));
if
(
device_
.
getType
()
==
Device
::
Type
::
CPU
)
{
if
(
device_
.
getType
()
==
Device
::
Type
::
CPU
)
{
device_memory_allocator_
=
std
::
make_unique
<
Host
Allocator
>
();
device_memory_allocator_
=
std
::
make_unique
<
PinnableBlock
Allocator
>
(
device
);
}
else
{
}
else
{
device_memory_allocator_
=
std
::
make_unique
<
DeviceCaching
Allocator
>
(
device
);
device_memory_allocator_
=
std
::
make_unique
<
PinnableBlock
Allocator
>
(
device
);
pinned_host_memory_allocator_
=
std
::
make_unique
<
DevicePinnedHostAllocator
>
(
device
);
pinned_host_memory_allocator_
=
std
::
make_unique
<
DevicePinnedHostAllocator
>
(
device
);
}
}
}
}
...
@@ -76,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
...
@@ -76,6 +77,15 @@ std::shared_ptr<Memory> Runtime::allocatePinnedHostMemory(size_t size) {
true
);
true
);
}
}
std
::
shared_ptr
<
Memory
>
Runtime
::
reinstantiateBlob
(
std
::
shared_ptr
<
Memory
>
blob
)
{
device_memory_allocator_
.
get
()
->
mark_in_use_
(
blob
->
data
(),
true
);
return
std
::
make_shared
<
Memory
>
(
blob
->
data
(),
blob
->
size
(),
device_
,
[
alloc
=
device_memory_allocator_
.
get
()](
std
::
byte
*
p
)
{
alloc
->
deallocate
(
p
);
});
}
void
Runtime
::
memcpyH2D
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
bool
async
)
{
void
Runtime
::
memcpyH2D
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
bool
async
)
{
if
(
async
)
{
if
(
async
)
{
INFINICORE_CHECK_ERROR
(
infinirtMemcpyAsync
(
dst
,
src
,
size
,
INFINIRT_MEMCPY_H2D
,
stream_
));
INFINICORE_CHECK_ERROR
(
infinirtMemcpyAsync
(
dst
,
src
,
size
,
INFINIRT_MEMCPY_H2D
,
stream_
));
...
@@ -144,6 +154,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
...
@@ -144,6 +154,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
INFINICORE_CHECK_ERROR
(
infinirtStreamWaitEvent
(
stream
,
event
));
INFINICORE_CHECK_ERROR
(
infinirtStreamWaitEvent
(
stream
,
event
));
}
}
bool
Runtime
::
isGraphRecording
()
const
{
return
graph_manager_
->
is_recording
();
}
void
Runtime
::
startGraphRecording
()
{
device_memory_allocator_
->
set_pin_mode
(
true
);
return
graph_manager_
->
start_recording
();
}
void
Runtime
::
addGraphOperator
(
std
::
shared_ptr
<
graph
::
GraphOperator
>
op
)
{
return
graph_manager_
->
add_operator
(
op
);
}
std
::
shared_ptr
<
graph
::
Graph
>
Runtime
::
stopGraphRecording
()
{
auto
graph
=
graph_manager_
->
stop_recording
();
device_memory_allocator_
->
set_pin_mode
(
false
);
return
graph
;
}
std
::
string
Runtime
::
toString
()
const
{
std
::
string
Runtime
::
toString
()
const
{
return
fmt
::
format
(
"Runtime({})"
,
device_
.
toString
());
return
fmt
::
format
(
"Runtime({})"
,
device_
.
toString
());
}
}
...
...
src/infinicore/context/runtime/runtime.hpp
View file @
8d09630a
#pragma once
#pragma once
#include "../allocators/memory_allocator.hpp"
#include "../allocators/pinnable_block_allocator.hpp"
#include "infinicore/context/context.hpp"
#include "infinicore/context/context.hpp"
#include "../../graph/graph_manager.hpp"
#include <infiniop.h>
#include <infiniop.h>
#include <infinirt.h>
#include <infinirt.h>
...
@@ -13,8 +16,9 @@ private:
...
@@ -13,8 +16,9 @@ private:
Device
device_
;
Device
device_
;
infinirtStream_t
stream_
;
infinirtStream_t
stream_
;
infiniopHandle_t
infiniop_handle_
;
infiniopHandle_t
infiniop_handle_
;
std
::
unique_ptr
<
Memory
Allocator
>
device_memory_allocator_
;
std
::
unique_ptr
<
PinnableBlock
Allocator
>
device_memory_allocator_
;
std
::
unique_ptr
<
MemoryAllocator
>
pinned_host_memory_allocator_
;
std
::
unique_ptr
<
MemoryAllocator
>
pinned_host_memory_allocator_
;
std
::
unique_ptr
<
graph
::
GraphManager
>
graph_manager_
;
protected:
protected:
Runtime
(
Device
device
);
Runtime
(
Device
device
);
...
@@ -33,6 +37,7 @@ public:
...
@@ -33,6 +37,7 @@ public:
std
::
shared_ptr
<
Memory
>
allocateMemory
(
size_t
size
);
std
::
shared_ptr
<
Memory
>
allocateMemory
(
size_t
size
);
std
::
shared_ptr
<
Memory
>
allocatePinnedHostMemory
(
size_t
size
);
std
::
shared_ptr
<
Memory
>
allocatePinnedHostMemory
(
size_t
size
);
std
::
shared_ptr
<
Memory
>
reinstantiateBlob
(
std
::
shared_ptr
<
Memory
>
blob
);
void
memcpyH2D
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
bool
async
=
true
);
void
memcpyH2D
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
bool
async
=
true
);
void
memcpyD2H
(
void
*
dst
,
const
void
*
src
,
size_t
size
);
void
memcpyD2H
(
void
*
dst
,
const
void
*
src
,
size_t
size
);
...
@@ -48,6 +53,12 @@ public:
...
@@ -48,6 +53,12 @@ public:
float
elapsedTime
(
infinirtEvent_t
start
,
infinirtEvent_t
end
);
float
elapsedTime
(
infinirtEvent_t
start
,
infinirtEvent_t
end
);
void
streamWaitEvent
(
infinirtStream_t
stream
,
infinirtEvent_t
event
);
void
streamWaitEvent
(
infinirtStream_t
stream
,
infinirtEvent_t
event
);
// Graph
bool
isGraphRecording
()
const
;
void
startGraphRecording
();
void
addGraphOperator
(
std
::
shared_ptr
<
graph
::
GraphOperator
>
op
);
std
::
shared_ptr
<
graph
::
Graph
>
stopGraphRecording
();
std
::
string
toString
()
const
;
std
::
string
toString
()
const
;
friend
class
ContextImpl
;
friend
class
ContextImpl
;
...
...
src/infinicore/device.cc
View file @
8d09630a
...
@@ -41,6 +41,8 @@ std::string Device::toString(const Type &type) {
...
@@ -41,6 +41,8 @@ std::string Device::toString(const Type &type) {
return
"KUNLUN"
;
return
"KUNLUN"
;
case
Type
::
HYGON
:
case
Type
::
HYGON
:
return
"HYGON"
;
return
"HYGON"
;
case
Type
::
ALI
:
return
"ALI"
;
case
Type
::
COUNT
:
case
Type
::
COUNT
:
return
"COUNT"
;
return
"COUNT"
;
default:
default:
...
...
src/infinicore/graph/graph.cc
0 → 100644
View file @
8d09630a
#include "graph_manager.hpp"
#include "../utils.hpp"
#include "infinicore/context/context.hpp"
#include <infinirt.h>
namespace
infinicore
::
graph
{
/* =========================
* GraphTensor
* ========================= */
GraphTensor
::
GraphTensor
(
const
Tensor
&
tensor
)
:
Tensor
(
tensor
->
to_blob_
())
{
}
/* =========================
* GraphOperator
* ========================= */
void
DispatchableGraphOperator
::
run
()
const
{
runner_
(
planned_meta_
);
}
DispatchableGraphOperator
::~
DispatchableGraphOperator
()
{
if
(
deleter_
)
{
deleter_
(
&
planned_meta_
);
}
}
/* =========================
* Graph
* ========================= */
struct
Graph
::
DeviceGraph
{
infinirtGraph_t
graph
;
infinirtGraphExec_t
exec
;
infinirtGraphNode_t
node
;
std
::
vector
<
char
>
log_buffer
;
DeviceGraph
()
{
log_buffer
.
resize
(
4
*
1024
);
}
~
DeviceGraph
()
{
if
(
exec
)
{
infinirtGraphExecDestroy
(
exec
);
}
if
(
graph
)
{
infinirtGraphDestroy
(
graph
);
}
}
void
launch
()
{
INFINICORE_CHECK_ERROR
(
infinirtGraphLuanch
(
exec
,
context
::
getStream
()));
}
};
Graph
::
Graph
()
{
}
void
Graph
::
run
()
const
{
if
(
device_graph_
!=
nullptr
&&
device_graph_
.
get
()
->
exec
!=
nullptr
)
{
device_graph_
.
get
()
->
launch
();
}
else
{
for
(
auto
&
op
:
op_list_
)
{
op
->
run
();
}
}
}
void
Graph
::
add_operator
(
std
::
shared_ptr
<
GraphOperator
>
op
)
{
op_list_
.
push_back
(
op
);
}
void
Graph
::
instantiate
()
{
// Reset device graph
device_graph_
=
std
::
make_unique
<
DeviceGraph
>
();
// warmup
for
(
size_t
iter
=
0
;
iter
<
5
;
++
iter
)
{
this
->
run
();
}
infinicore
::
context
::
syncStream
();
if
(
infinirtStreamBeginCapture
(
context
::
getStream
(),
INFINIRT_STREAM_CAPTURE_MODE_RELAXED
)
!=
INFINI_STATUS_SUCCESS
)
{
return
;
}
// Run and record
this
->
run
();
if
(
infinirtStreamEndCapture
(
context
::
getStream
(),
&
device_graph_
.
get
()
->
graph
)
!=
INFINI_STATUS_SUCCESS
)
{
return
;
}
if
(
infinirtGraphInstantiate
(
&
device_graph_
.
get
()
->
exec
,
device_graph_
.
get
()
->
graph
,
&
device_graph_
.
get
()
->
node
,
device_graph_
.
get
()
->
log_buffer
.
data
(),
device_graph_
.
get
()
->
log_buffer
.
size
())
!=
INFINI_STATUS_SUCCESS
)
{
static
bool
warned_once
=
false
;
if
(
!
warned_once
)
{
warned_once
=
true
;
spdlog
::
warn
(
"Fail to instantiate device graph: {}"
,
std
::
string
(
device_graph_
.
get
()
->
log_buffer
.
data
()));
}
}
}
Graph
::~
Graph
()
=
default
;
/* =========================
* GraphManager
* ========================= */
bool
GraphManager
::
is_recording
()
const
{
return
recording_
;
}
void
GraphManager
::
start_recording
()
{
if
(
is_recording
())
{
spdlog
::
warn
(
"Graph is already recording. Previous recording will be dropped."
);
}
recording_
=
true
;
graph_
=
std
::
make_shared
<
Graph
>
();
}
void
GraphManager
::
add_operator
(
std
::
shared_ptr
<
GraphOperator
>
op
)
{
INFINICORE_ASSERT
(
is_recording
());
graph_
->
add_operator
(
op
);
}
std
::
shared_ptr
<
Graph
>
GraphManager
::
stop_recording
()
{
if
(
!
is_recording
())
{
spdlog
::
warn
(
"Graph is not recording. Please start recording first."
);
return
nullptr
;
}
recording_
=
false
;
#ifdef USE_INFINIRT_GRAPH
graph_
->
instantiate
();
#endif
return
std
::
exchange
(
graph_
,
nullptr
);
}
}
// namespace infinicore::graph
src/infinicore/graph/graph_manager.hpp
0 → 100644
View file @
8d09630a
#pragma once
#include "infinicore/graph/graph.hpp"
#include <memory>
#include <vector>
namespace
infinicore
::
graph
{
class
GraphManager
{
public:
GraphManager
()
=
default
;
~
GraphManager
()
=
default
;
bool
is_recording
()
const
;
void
start_recording
();
void
add_operator
(
std
::
shared_ptr
<
GraphOperator
>
op
);
std
::
shared_ptr
<
Graph
>
stop_recording
();
private:
std
::
shared_ptr
<
Graph
>
graph_
;
bool
recording_
=
false
;
};
}
// namespace infinicore::graph
src/infinicore/nn/embedding.cc
View file @
8d09630a
...
@@ -43,6 +43,13 @@ Embedding::Embedding(size_t num_embeddings,
...
@@ -43,6 +43,13 @@ Embedding::Embedding(size_t num_embeddings,
}
}
Tensor
Embedding
::
forward
(
const
Tensor
&
indices
)
const
{
Tensor
Embedding
::
forward
(
const
Tensor
&
indices
)
const
{
// TODO: Implement on-device embedding for all devices, then remove the condition and the classic approach
auto
device_type
=
device_
.
getType
();
if
(
device_type
==
Device
::
Type
::
NVIDIA
||
device_type
==
Device
::
Type
::
ILUVATAR
||
device_type
==
Device
::
Type
::
METAX
||
device_type
==
Device
::
Type
::
MOORE
||
device_type
==
Device
::
Type
::
ALI
)
{
// Use op::embedding which supports device-side input and batch dimension
return
op
::
embedding
(
indices
->
contiguous
()
->
to
(
device_
),
weight_
);
}
// Get the shape of indices
// Get the shape of indices
auto
indices_shape
=
indices
->
shape
();
auto
indices_shape
=
indices
->
shape
();
...
...
src/infinicore/nn/linear.cc
View file @
8d09630a
#include "infinicore/nn/linear.hpp"
#include "infinicore/nn/linear.hpp"
#include "../utils.hpp"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/distributed/allreduce.hpp"
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/linear_w8a8i8.hpp"
#include <optional>
#include <optional>
#include <spdlog/spdlog.h>
#include <spdlog/spdlog.h>
...
@@ -17,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias,
...
@@ -17,21 +19,46 @@ BaseLinear::BaseLinear(size_t in_features, size_t out_features, bool bias,
device_
=
device
;
device_
=
device
;
}
}
Tensor
BaseLinear
::
compute_linear
(
Tensor
&
input
)
const
{
BaseLinear
::
BaseLinear
(
size_t
in_features
,
size_t
out_features
,
std
::
shared_ptr
<
infinicore
::
quantization
::
BaseQuantization
>
quantization
,
bool
bias
,
const
DataType
&
dtype
,
const
Device
&
device
)
:
in_features_
(
in_features
),
out_features_
(
out_features
),
quantization_
(
quantization
),
has_bias_
(
bias
),
dtype_
(
dtype
)
{
device_
=
device
;
}
// Ensure input is contiguous before creating views (required for matmul)
Tensor
BaseLinear
::
compute_linear
(
Tensor
&
input
)
const
{
// This prevents hanging when input tensor has non-contiguous memory layout
switch
(
this
->
quantization_
->
get_quant_scheme
())
{
Tensor
input_contiguous
=
input
->
is_contiguous
()
?
input
:
input
->
contiguous
();
case
infinicore
::
quantization
::
QuantScheme
::
COMPRESSED_TENSOR_W8A8I8
:
{
Tensor
input_contiguous
=
input
->
is_contiguous
()
?
input
:
input
->
contiguous
();
// Use ops::linear_ directly to match Python backend's exact code path
Tensor
weight_packed_tensor
=
static_cast
<
const
Tensor
&>
(
weight_
);
// This ensures identical computation and numerical results
Tensor
weight_scale_tensor
=
static_cast
<
const
Tensor
&>
(
weight_scale_
);
// Parameter inherits from Tensor, so we cast to Tensor explicitly
// weight_packed should be transposed and non-contiguous.
Tensor
weight_tensor
=
static_cast
<
const
Tensor
&>
(
weight_
);
std
::
optional
<
Tensor
>
bias_opt
=
has_bias_
?
std
::
make_optional
<
Tensor
>
(
static_cast
<
const
Tensor
&>
(
bias_
))
:
std
::
nullopt
;
std
::
optional
<
Tensor
>
bias_opt
=
has_bias_
?
std
::
make_optional
<
Tensor
>
(
static_cast
<
const
Tensor
&>
(
bias_
))
:
std
::
nullopt
;
auto
output
=
infinicore
::
op
::
linear
(
input_contiguous
->
contiguous
(),
weight_tensor
->
contiguous
(),
bias_opt
);
auto
output
=
infinicore
::
op
::
linear_w8a8i8
(
input_contiguous
->
contiguous
(),
weight_packed_tensor
,
weight_scale_tensor
,
bias_opt
);
return
output
;
return
output
;
}
}
default:
{
// Ensure input is contiguous before creating views (required for matmul)
// This prevents hanging when input tensor has non-contiguous memory layout
Tensor
input_contiguous
=
input
->
is_contiguous
()
?
input
:
input
->
contiguous
();
// Use ops::linear_ directly to match Python backend's exact code path
// This ensures identical computation and numerical results
// Parameter inherits from Tensor, so we cast to Tensor explicitly
Tensor
weight_tensor
=
static_cast
<
const
Tensor
&>
(
weight_
);
std
::
optional
<
Tensor
>
bias_opt
=
has_bias_
?
std
::
make_optional
<
Tensor
>
(
static_cast
<
const
Tensor
&>
(
bias_
))
:
std
::
nullopt
;
auto
output
=
infinicore
::
op
::
linear
(
input_contiguous
->
contiguous
(),
weight_tensor
->
contiguous
(),
bias_opt
);
return
output
;
}
}
}
// namespace infinicore::nn
Tensor
BaseLinear
::
forward
(
Tensor
&
input
)
const
{
Tensor
BaseLinear
::
forward
(
Tensor
&
input
)
const
{
return
compute_linear
(
input
);
return
compute_linear
(
input
);
...
@@ -70,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias,
...
@@ -70,6 +97,43 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias,
// in_features, out_features, bias, static_cast<int>(dtype_));
// in_features, out_features, bias, static_cast<int>(dtype_));
}
}
Linear
::
Linear
(
size_t
in_features
,
size_t
out_features
,
std
::
shared_ptr
<
infinicore
::
quantization
::
BaseQuantization
>
quantization
,
bool
bias
,
const
DataType
&
dtype
,
const
Device
&
device
)
:
BaseLinear
(
in_features
,
out_features
,
quantization
,
bias
,
dtype
,
device_
)
{
device_
=
device
;
switch
(
this
->
quantization_
->
get_quant_scheme
())
{
case
infinicore
::
quantization
::
QuantScheme
::
COMPRESSED_TENSOR_W8A8I8
:
{
INFINICORE_NN_PARAMETER_INIT
(
weight
,
({
out_features
,
in_features
},
infinicore
::
DataType
::
I8
,
device
));
INFINICORE_NN_PARAMETER_INIT
(
weight_scale
,
({
out_features
,
1
},
infinicore
::
DataType
::
F32
,
device
));
if
(
bias
)
{
INFINICORE_NN_PARAMETER_INIT
(
bias
,
({
out_features
},
dtype_
,
device
));
}
else
{
bias_
=
Parameter
();
}
break
;
}
default:
{
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT
(
weight
,
({
out_features
,
in_features
},
dtype_
,
device
));
// Register bias parameter if requested
if
(
bias
)
{
INFINICORE_NN_PARAMETER_INIT
(
bias
,
({
out_features
},
dtype_
,
device
));
}
else
{
bias_
=
Parameter
();
// Default constructed empty parameter
}
// SPDLOG_DEBUG("Created Linear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break
;
}
}
}
Tensor
Linear
::
forward
(
Tensor
&
input
)
const
{
Tensor
Linear
::
forward
(
Tensor
&
input
)
const
{
return
BaseLinear
::
forward
(
input
);
return
BaseLinear
::
forward
(
input
);
}
}
...
@@ -102,9 +166,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
...
@@ -102,9 +166,45 @@ ColumnParallelLinear::ColumnParallelLinear(size_t in_features, size_t out_featur
}
else
{
}
else
{
bias_
=
Parameter
();
// Default constructed empty parameter
bias_
=
Parameter
();
// Default constructed empty parameter
}
}
}
// SPDLOG_DEBUG("Created ColumnParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
ColumnParallelLinear
::
ColumnParallelLinear
(
size_t
in_features
,
size_t
out_features
,
std
::
shared_ptr
<
infinicore
::
quantization
::
BaseQuantization
>
quantization
,
bool
bias
,
// in_features, out_features, bias, static_cast<int>(dtype_));
const
DataType
&
dtype
,
const
Device
&
device
,
Size
tp_rank
,
Size
tp_size
)
:
BaseLinear
(
in_features
,
out_features
,
quantization
,
bias
,
dtype
,
device_
),
tp_rank_
(
tp_rank
),
tp_size_
(
tp_size
)
{
device_
=
device
;
switch
(
this
->
quantization_
->
get_quant_scheme
())
{
case
infinicore
::
quantization
::
QuantScheme
::
COMPRESSED_TENSOR_W8A8I8
:
{
INFINICORE_NN_PARAMETER_INIT
(
weight
,
({
out_features
,
in_features
},
infinicore
::
DataType
::
I8
,
device
,
0
,
tp_rank_
,
tp_size_
));
INFINICORE_NN_PARAMETER_INIT
(
weight_scale
,
({
out_features
,
1
},
infinicore
::
DataType
::
F32
,
device
,
0
,
tp_rank_
,
tp_size_
));
if
(
bias
)
{
INFINICORE_NN_PARAMETER_INIT
(
bias
,
({
out_features
},
dtype_
,
device
,
0
,
0
,
1
));
}
else
{
bias_
=
Parameter
();
}
break
;
}
default:
{
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT
(
weight
,
({
out_features
,
in_features
},
dtype_
,
device
,
0
,
tp_rank_
,
tp_size_
));
// Register bias parameter if requested
if
(
bias
)
{
INFINICORE_NN_PARAMETER_INIT
(
bias
,
({
out_features
},
dtype_
,
device
,
0
,
tp_rank_
,
tp_size_
));
}
else
{
bias_
=
Parameter
();
// Default constructed empty parameter
}
break
;
}
}
}
}
Tensor
ColumnParallelLinear
::
forward
(
Tensor
&
input
)
const
{
Tensor
ColumnParallelLinear
::
forward
(
Tensor
&
input
)
const
{
...
@@ -138,26 +238,53 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo
...
@@ -138,26 +238,53 @@ RowParallelLinear::RowParallelLinear(size_t in_features, size_t out_features, bo
}
else
{
}
else
{
bias_
=
Parameter
();
// Default constructed empty parameter
bias_
=
Parameter
();
// Default constructed empty parameter
}
}
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
RowParallelLinear
::
RowParallelLinear
(
size_t
in_features
,
size_t
out_features
,
std
::
shared_ptr
<
infinicore
::
quantization
::
BaseQuantization
>
quantization
,
bool
bias
,
// in_features, out_features, bias, static_cast<int>(dtype_));
const
DataType
&
dtype
,
const
Device
&
device
,
Size
tp_rank
,
Size
tp_size
,
infinicclComm_t
communicator
)
:
BaseLinear
(
in_features
,
out_features
,
quantization
,
bias
,
dtype
,
device_
),
tp_rank_
(
tp_rank
),
tp_size_
(
tp_size
),
communicator_
(
communicator
)
{
device_
=
device
;
switch
(
this
->
quantization_
->
get_quant_scheme
())
{
case
infinicore
::
quantization
::
QuantScheme
::
COMPRESSED_TENSOR_W8A8I8
:
{
INFINICORE_NN_PARAMETER_INIT
(
weight
,
({
out_features
,
in_features
},
infinicore
::
DataType
::
I8
,
device
,
1
,
tp_rank_
,
tp_size_
));
INFINICORE_NN_PARAMETER_INIT
(
weight_scale
,
({
out_features
,
1
},
infinicore
::
DataType
::
F32
,
device
,
0
,
0
,
1
));
if
(
bias
)
{
INFINICORE_NN_PARAMETER_INIT
(
bias
,
({
out_features
},
dtype_
,
device
,
0
,
tp_rank_
,
tp_size_
));
}
else
{
bias_
=
Parameter
();
}
break
;
}
default:
{
// Initialize parameters using macro
INFINICORE_NN_PARAMETER_INIT
(
weight
,
({
out_features
,
in_features
},
dtype_
,
device
,
1
,
tp_rank_
,
tp_size_
));
// Register bias parameter if requested
if
(
bias
&&
(
0
==
tp_rank_
))
{
INFINICORE_NN_PARAMETER_INIT
(
bias
,
({
out_features
},
dtype_
,
device
,
0
,
0
,
1
));
}
else
{
bias_
=
Parameter
();
// Default constructed empty parameter
}
// SPDLOG_DEBUG("Created RowParallelLinear module: in_features={}, out_features={}, bias={}, dtype={}",
// in_features, out_features, bias, static_cast<int>(dtype_));
break
;
}
}
}
}
Tensor
RowParallelLinear
::
forward
(
Tensor
&
input
)
const
{
Tensor
RowParallelLinear
::
forward
(
Tensor
&
input
)
const
{
auto
output
=
BaseLinear
::
forward
(
input
);
auto
output
=
BaseLinear
::
forward
(
input
);
if
((
tp_size_
>
1
)
&&
(
communicator_
!=
nullptr
))
{
if
((
tp_size_
>
1
)
&&
(
communicator_
!=
nullptr
))
{
op
::
distributed
::
allreduce_
(
output
,
output
,
INFINICCL_SUM
,
communicator_
);
Size
count
=
output
->
numel
();
DataType
type
=
output
->
dtype
();
infinirtStream_t
stream
=
infinicore
::
context
::
getStream
();
INFINICORE_CHECK_ERROR
(
infinicclAllReduce
(
output
->
data
(),
output
->
data
(),
count
,
static_cast
<
infiniDtype_t
>
(
static_cast
<
int
>
(
type
)),
INFINICCL_SUM
,
communicator_
,
stream
));
INFINICORE_CHECK_ERROR
(
infinirtStreamSynchronize
(
stream
));
// RUN_INFINI(infinirtStreamSynchronize(stream));
}
}
return
output
;
return
output
;
}
}
...
...
src/infinicore/nn/rmsnorm.cc
View file @
8d09630a
...
@@ -21,6 +21,25 @@ Tensor RMSNorm::forward(const Tensor &x) const {
...
@@ -21,6 +21,25 @@ Tensor RMSNorm::forward(const Tensor &x) const {
return
op
::
rms_norm
(
x
,
weight_
,
static_cast
<
float
>
(
eps_
));
return
op
::
rms_norm
(
x
,
weight_
,
static_cast
<
float
>
(
eps_
));
}
}
void
RMSNorm
::
forward_inplace
(
Tensor
&
x
,
Tensor
&
residual
)
const
{
if
(
!
residual
)
{
residual
=
x
;
x
=
op
::
rms_norm
(
x
,
weight_
,
static_cast
<
float
>
(
eps_
));
}
else
{
if
(
device_
.
getType
()
==
Device
::
Type
::
CPU
||
device_
.
getType
()
==
Device
::
Type
::
NVIDIA
||
device_
.
getType
()
==
Device
::
Type
::
ILUVATAR
||
device_
.
getType
()
==
Device
::
Type
::
METAX
||
device_
.
getType
()
==
Device
::
Type
::
MOORE
||
device_
.
getType
()
==
Device
::
Type
::
ALI
)
{
op
::
add_rms_norm_inplace
(
x
,
residual
,
weight_
,
static_cast
<
float
>
(
eps_
));
}
else
{
op
::
add_
(
residual
,
x
,
residual
);
op
::
rms_norm_
(
x
,
residual
,
weight_
,
static_cast
<
float
>
(
eps_
));
}
}
}
std
::
string
RMSNorm
::
extra_repr
()
const
{
std
::
string
RMSNorm
::
extra_repr
()
const
{
return
"RMSNorm(normalized_shape="
+
std
::
to_string
(
normalized_shape_
)
+
", eps="
+
std
::
to_string
(
eps_
)
+
", dtype="
+
std
::
to_string
(
static_cast
<
int
>
(
dtype_
))
+
")"
;
return
"RMSNorm(normalized_shape="
+
std
::
to_string
(
normalized_shape_
)
+
", eps="
+
std
::
to_string
(
eps_
)
+
", dtype="
+
std
::
to_string
(
static_cast
<
int
>
(
dtype_
))
+
")"
;
}
}
...
...
src/infinicore/nn/rope.cc
View file @
8d09630a
...
@@ -16,12 +16,14 @@ RoPE::RoPE(size_t head_dim,
...
@@ -16,12 +16,14 @@ RoPE::RoPE(size_t head_dim,
double
theta
,
double
theta
,
Algo
algo
,
Algo
algo
,
const
DataType
&
dtype
,
const
DataType
&
dtype
,
const
Device
&
device
)
const
Device
&
device
,
std
::
shared_ptr
<
ScalingConfig
>
scaling
)
:
head_dim_
(
head_dim
),
:
head_dim_
(
head_dim
),
max_seq_len_
(
max_seq_len
),
max_seq_len_
(
max_seq_len
),
theta_
(
theta
),
theta_
(
theta
),
algo_
(
algo
),
algo_
(
algo
),
dtype_
(
dtype
)
{
dtype_
(
dtype
),
scaling_
(
scaling
)
{
if
(
head_dim
%
2
!=
0
)
{
if
(
head_dim
%
2
!=
0
)
{
throw
std
::
invalid_argument
(
"head_dim must be even for RoPE, got "
+
std
::
to_string
(
head_dim
));
throw
std
::
invalid_argument
(
"head_dim must be even for RoPE, got "
+
std
::
to_string
(
head_dim
));
}
}
...
@@ -54,14 +56,30 @@ void RoPE::initialize_cache() {
...
@@ -54,14 +56,30 @@ void RoPE::initialize_cache() {
for
(
size_t
j
=
0
;
j
<
cache_dim
;
j
++
)
{
for
(
size_t
j
=
0
;
j
<
cache_dim
;
j
++
)
{
// GPT-J style inverse frequency: theta^(-2j/head_dim)
// GPT-J style inverse frequency: theta^(-2j/head_dim)
// Compute directly in float to avoid double->float casting
// Compute directly in float to avoid double->float casting
float
inv_freq
=
1.0
f
/
std
::
pow
(
static_cast
<
float
>
(
theta_
),
2.0
f
*
static_cast
<
float
>
(
j
)
/
static_cast
<
float
>
(
head_dim_
));
float
inv_freq
;
float
table_factor
=
1.0
f
;
if
(
scaling_
==
nullptr
)
{
inv_freq
=
1.0
f
/
std
::
pow
(
static_cast
<
float
>
(
theta_
),
2.0
f
*
static_cast
<
float
>
(
j
)
/
static_cast
<
float
>
(
head_dim_
));
}
else
if
(
scaling_
->
type
()
==
ScalingType
::
LONGROPE
)
{
std
::
shared_ptr
<
LongRopeConfig
>
lr
=
std
::
dynamic_pointer_cast
<
LongRopeConfig
>
(
scaling_
);
table_factor
=
lr
->
factor
();
float
_ext
;
if
(
pos
<
lr
->
original_max_position_embeddings
())
{
_ext
=
lr
->
short_factor
()[
j
];
}
else
{
_ext
=
lr
->
long_factor
()[
j
];
}
inv_freq
=
1.0
f
/
(
_ext
*
std
::
pow
(
static_cast
<
float
>
(
theta_
),
2.0
f
*
static_cast
<
float
>
(
j
)
/
static_cast
<
float
>
(
head_dim_
)));
}
else
{
inv_freq
=
1.0
f
/
std
::
pow
(
static_cast
<
float
>
(
theta_
),
2.0
f
*
static_cast
<
float
>
(
j
)
/
static_cast
<
float
>
(
head_dim_
));
}
// Compute angle: position * inverse_frequency
// Compute angle: position * inverse_frequency
float
angle
=
static_cast
<
float
>
(
pos
)
*
inv_freq
;
float
angle
=
static_cast
<
float
>
(
pos
)
*
inv_freq
;
// Compute sin and cos directly on float
// Compute sin and cos directly on float
sin_data
[
pos
*
cache_dim
+
j
]
=
std
::
sin
(
angle
);
sin_data
[
pos
*
cache_dim
+
j
]
=
std
::
sin
(
angle
)
*
table_factor
;
cos_data
[
pos
*
cache_dim
+
j
]
=
std
::
cos
(
angle
);
cos_data
[
pos
*
cache_dim
+
j
]
=
std
::
cos
(
angle
)
*
table_factor
;
}
}
}
}
...
...
src/infinicore/ops/add/add.cc
View file @
8d09630a
...
@@ -3,24 +3,24 @@
...
@@ -3,24 +3,24 @@
namespace
infinicore
::
op
{
namespace
infinicore
::
op
{
common
::
OpDispatcher
<
Add
::
schema
>
&
Add
::
dispatcher
()
{
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL
(
Add
);
static
common
::
OpDispatcher
<
Add
::
schema
>
dispatcher_
;
return
dispatcher_
;
};
void
Add
::
execute
(
Tensor
c
,
Tensor
a
,
Tensor
b
)
{
Add
::
Add
(
Tensor
c
,
const
Tensor
&
a
,
const
Tensor
&
b
)
{
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
c
,
a
,
b
);
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
c
,
a
,
b
);
infinicore
::
context
::
setDevice
(
c
->
device
());
INFINICORE_GRAPH_OP_DISPATCH
(
c
->
device
().
getType
(),
c
,
a
,
b
);
dispatcher
().
lookup
(
c
->
device
().
getType
())(
c
,
a
,
b
);
}
}
Tensor
add
(
Tensor
a
,
Tensor
b
)
{
void
Add
::
execute
(
Tensor
c
,
const
Tensor
&
a
,
const
Tensor
&
b
)
{
INFINICORE_GRAPH_OP_RECORD_OR_RUN
(
Add
,
c
,
a
,
b
);
}
Tensor
add
(
const
Tensor
&
a
,
const
Tensor
&
b
)
{
auto
c
=
Tensor
::
empty
(
a
->
shape
(),
a
->
dtype
(),
a
->
device
());
auto
c
=
Tensor
::
empty
(
a
->
shape
(),
a
->
dtype
(),
a
->
device
());
add_
(
c
,
a
,
b
);
add_
(
c
,
a
,
b
);
return
c
;
return
c
;
}
}
void
add_
(
Tensor
c
,
Tensor
a
,
Tensor
b
)
{
void
add_
(
Tensor
c
,
const
Tensor
&
a
,
const
Tensor
&
b
)
{
Add
::
execute
(
c
,
a
,
b
);
Add
::
execute
(
c
,
a
,
b
);
}
}
...
...
src/infinicore/ops/add/add_infiniop.cc
View file @
8d09630a
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/common/cache.hpp"
#include
<
infiniop
.h>
#include
"../
infiniop
_impl.hpp"
namespace
infinicore
::
op
::
add_impl
::
infiniop
{
namespace
infinicore
::
op
::
add_impl
::
infiniop
{
thread_local
common
::
OpCache
<
size_t
,
infiniopAddDescriptor_t
>
caches
(
INFINIOP_CACHABLE_DESCRIPTOR
(
Descriptor
,
Add
,
100
);
100
,
// capacity
[](
infiniopAddDescriptor_t
&
desc
)
{
struct
PlannedMeta
{
if
(
desc
!=
nullptr
)
{
std
::
shared_ptr
<
Descriptor
>
descriptor
;
INFINICORE_CHECK_ERROR
(
infiniopDestroyAddDescriptor
(
desc
));
graph
::
GraphTensor
workspace
,
c
,
a
,
b
;
desc
=
nullptr
;
};
}
});
void
calculate
(
Tensor
c
,
Tensor
a
,
Tensor
b
)
{
void
*
plan
(
Tensor
c
,
const
Tensor
&
a
,
const
Tensor
&
b
)
{
size_t
seed
=
hash_combine
(
c
,
b
,
a
);
size_t
seed
=
hash_combine
(
c
,
b
,
a
);
auto
device
=
context
::
getDevice
();
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE
(
auto
&
cache
=
caches
.
getCache
(
device
);
Descriptor
,
descriptor
,
Add
,
seed
,
c
->
desc
(),
a
->
desc
(),
b
->
desc
());
auto
desc_opt
=
cache
.
get
(
seed
);
INFINIOP_WORKSPACE_TENSOR
(
workspace
,
Add
,
descriptor
);
infiniopAddDescriptor_t
desc
=
nullptr
;
if
(
!
desc_opt
)
{
return
new
PlannedMeta
{
INFINICORE_CHECK_ERROR
(
infiniopCreateAddDescriptor
(
descriptor
,
context
::
getInfiniopHandle
(
device
),
&
desc
,
graph
::
GraphTensor
(
workspace
),
c
->
desc
(),
a
->
desc
(),
b
->
desc
()));
graph
::
GraphTensor
(
c
),
cache
.
put
(
seed
,
desc
);
graph
::
GraphTensor
(
a
),
}
else
{
graph
::
GraphTensor
(
b
)};
desc
=
*
desc_opt
;
}
}
size_t
workspace_size
=
0
;
void
run
(
void
*
planned_meta
)
{
INFINICORE_CHECK_ERROR
(
infiniopGetAddWorkspaceSize
(
desc
,
&
workspace_size
));
auto
planned
=
reinterpret_cast
<
PlannedMeta
*>
(
planned_meta
);
std
::
shared_ptr
<
Memory
>
workspace
=
context
::
allocateMemory
(
workspace_size
);
INFINICORE_CHECK_ERROR
(
infiniopAdd
(
INFINICORE_CHECK_ERROR
(
infiniopAdd
(
desc
,
workspace
->
data
(),
workspace_size
,
planned
->
descriptor
->
desc
,
c
->
data
(),
a
->
data
(),
b
->
data
(),
context
::
getStream
()));
planned
->
workspace
->
data
(),
planned
->
workspace
->
numel
(),
planned
->
c
->
data
(),
planned
->
a
->
data
(),
planned
->
b
->
data
(),
context
::
getStream
()));
}
void
cleanup
(
void
**
planned_meta_ptr
)
{
delete
*
reinterpret_cast
<
PlannedMeta
**>
(
planned_meta_ptr
);
*
planned_meta_ptr
=
nullptr
;
}
}
static
bool
registered
=
[]()
{
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE
(
Add
,
&
plan
,
&
run
,
&
cleanup
);
Add
::
dispatcher
().
registerAll
(
&
calculate
,
false
);
return
true
;
}();
}
// namespace infinicore::op::add_impl::infiniop
}
// namespace infinicore::op::add_impl::infiniop
src/infinicore/ops/add_rms_norm/add_rms_norm.cc
0 → 100644
View file @
8d09630a
#include "infinicore/ops/add_rms_norm.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(AddRMSNorm);

// Fused residual-add + RMSNorm op:
//   residual_out = a + b;  y = rmsnorm(residual_out, weight, epsilon)
// (semantics assumed from the operand names — confirm against the kernel docs).
AddRMSNorm::AddRMSNorm(Tensor y, Tensor residual_out,
                       const Tensor &a, const Tensor &b,
                       const Tensor &weight, float epsilon) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, residual_out, a, b, weight);
    INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, residual_out, a, b, weight, epsilon);
}

// Entry point used by the free functions below; records into the current
// graph when one is being captured, otherwise runs eagerly.
void AddRMSNorm::execute(Tensor y, Tensor residual_out,
                         const Tensor &a, const Tensor &b,
                         const Tensor &weight, float epsilon) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(AddRMSNorm, y, residual_out, a, b, weight, epsilon);
}

// Allocating variant: returns {normalized output, residual sum}.
std::pair<Tensor, Tensor> add_rms_norm(const Tensor &a, const Tensor &b,
                                       const Tensor &weight, float epsilon) {
    auto out = Tensor::empty(a->shape(), a->dtype(), a->device());
    auto res = Tensor::empty(a->shape(), a->dtype(), a->device());
    add_rms_norm_(out, res, a, b, weight, epsilon);
    return std::make_pair(out, res);
}

// In-place-style variant writing into caller-provided buffers.
void add_rms_norm_(Tensor out, Tensor residual,
                   const Tensor &a, const Tensor &b,
                   const Tensor &weight, float epsilon) {
    AddRMSNorm::execute(out, residual, a, b, weight, epsilon);
}

// Convenience overload that aliases inputs and outputs:
// input/residual are both read and overwritten.
void add_rms_norm_inplace(Tensor input, Tensor residual,
                          const Tensor &weight, float epsilon) {
    add_rms_norm_(input, residual, input, residual, weight, epsilon);
}

} // namespace infinicore::op
src/infinicore/ops/add_rms_norm/add_rms_norm_infiniop.cc
0 → 100644
View file @
8d09630a
#include "infinicore/ops/add_rms_norm.hpp"

#include "../infiniop_impl.hpp"

namespace infinicore::op::add_rms_norm_impl::infiniop {

// LRU-cached infiniop descriptor wrapper (capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, AddRMSNorm, 100);

// Everything `run` needs, captured at plan time. GraphTensor members keep the
// underlying buffers alive for the lifetime of a recorded graph.
struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, out, residual, a, b, weight;
    float epsilon;
};

// Plan phase: resolve (or create) the descriptor for this exact tensor
// configuration, size the workspace, and snapshot all operands.
void *plan(Tensor y, Tensor residual_out,
           const Tensor &a, const Tensor &b,
           const Tensor &weight, float epsilon) {
    size_t seed = hash_combine(y, residual_out, a, b, weight, epsilon);
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, AddRMSNorm, seed,
        y->desc(), residual_out->desc(), a->desc(), b->desc(), weight->desc(), epsilon);
    INFINIOP_WORKSPACE_TENSOR(workspace, AddRMSNorm, descriptor);

    auto *meta = new PlannedMeta{descriptor,
                                 graph::GraphTensor(workspace),
                                 graph::GraphTensor(y),
                                 graph::GraphTensor(residual_out),
                                 graph::GraphTensor(a),
                                 graph::GraphTensor(b),
                                 graph::GraphTensor(weight),
                                 epsilon};
    return meta;
}

// Execute phase: launch the kernel on the current stream using the planned state.
void run(void *planned_meta) {
    auto *meta = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopAddRMSNorm(meta->descriptor->desc,
                                              meta->workspace->data(),
                                              meta->workspace->numel(),
                                              meta->out->data(),
                                              meta->residual->data(),
                                              meta->a->data(),
                                              meta->b->data(),
                                              meta->weight->data(),
                                              context::getStream()));
}

// Cleanup phase: free the planned state and null out the caller's pointer.
void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(AddRMSNorm, &plan, &run, &cleanup);

} // namespace infinicore::op::add_rms_norm_impl::infiniop
src/infinicore/ops/causal_softmax/causal_softmax.cc
View file @
8d09630a
#include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/causal_softmax.hpp"
#include "../../utils.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace
infinicore
::
op
{
namespace
infinicore
::
op
{
common
::
OpDispatcher
<
CausalSoftmax
::
schema
>
&
CausalSoftmax
::
dispatcher
()
{
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL
(
CausalSoftmax
);
static
common
::
OpDispatcher
<
CausalSoftmax
::
schema
>
dispatcher_
;
return
dispatcher_
;
};
void
CausalSoftmax
::
execute
(
Tensor
output
,
Tensor
input
)
{
CausalSoftmax
::
CausalSoftmax
(
Tensor
output
,
const
Tensor
&
input
)
{
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
output
,
input
);
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
output
,
input
);
infinicore
::
context
::
setDevice
(
output
->
device
());
INFINICORE_GRAPH_OP_DISPATCH
(
output
->
device
().
getType
(),
output
,
input
);
auto
device_type
=
output
->
device
().
getType
();
}
auto
func
=
dispatcher
().
lookup
(
device_type
);
if
(
func
==
nullptr
)
{
throw
std
::
runtime_error
(
"No CausalSoftmax implementation found for device type: "
+
std
::
to_string
(
static_cast
<
int
>
(
device_type
)));
}
func
(
output
,
input
);
void
CausalSoftmax
::
execute
(
Tensor
output
,
const
Tensor
&
input
)
{
INFINICORE_GRAPH_OP_RECORD_OR_RUN
(
CausalSoftmax
,
output
,
input
);
}
}
Tensor
causal_softmax
(
Tensor
input
)
{
Tensor
causal_softmax
(
const
Tensor
&
input
)
{
Shape
shape
=
input
->
shape
();
auto
output
=
Tensor
::
empty
(
input
->
shape
(),
input
->
dtype
(),
input
->
device
());
auto
output
=
Tensor
::
empty
(
shape
,
input
->
dtype
(),
input
->
device
());
causal_softmax_
(
output
,
input
);
causal_softmax_
(
output
,
input
);
return
output
;
return
output
;
}
}
void
causal_softmax_
(
Tensor
output
,
Tensor
input
)
{
void
causal_softmax_
(
Tensor
output
,
const
Tensor
&
input
)
{
CausalSoftmax
::
execute
(
output
,
input
);
CausalSoftmax
::
execute
(
output
,
input
);
}
}
}
// namespace infinicore::op
}
// namespace infinicore::op
src/infinicore/ops/causal_softmax/causal_softmax_infiniop.cc
View file @
8d09630a
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/causal_softmax.hpp"
#include "infinicore/ops/common/cache.hpp"
#include
<
infiniop
.h>
#include
"../
infiniop
_impl.hpp"
namespace
infinicore
::
op
::
causal_softmax_impl
::
infiniop
{
namespace
infinicore
::
op
::
causal_softmax_impl
::
infiniop
{
thread_local
common
::
OpCache
<
size_t
,
infiniopCausalSoftmaxDescriptor_t
>
caches
(
INFINIOP_CACHABLE_DESCRIPTOR
(
Descriptor
,
CausalSoftmax
,
100
);
100
,
// capacity
[](
infiniopCausalSoftmaxDescriptor_t
&
desc
)
{
struct
PlannedMeta
{
if
(
desc
!=
nullptr
)
{
std
::
shared_ptr
<
Descriptor
>
descriptor
;
INFINICORE_CHECK_ERROR
(
infiniopDestroyCausalSoftmaxDescriptor
(
desc
));
graph
::
GraphTensor
workspace
,
output
,
input
;
desc
=
nullptr
;
};
}
});
void
calculate
(
Tensor
output
,
Tensor
input
)
{
void
*
plan
(
Tensor
output
,
const
Tensor
&
input
)
{
size_t
seed
=
hash_combine
(
output
,
input
);
size_t
seed
=
hash_combine
(
output
,
input
);
auto
device
=
context
::
getDevice
();
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE
(
auto
&
cache
=
caches
.
getCache
(
device
);
Descriptor
,
descriptor
,
CausalSoftmax
,
seed
,
output
->
desc
(),
input
->
desc
());
auto
desc_opt
=
cache
.
get
(
seed
);
INFINIOP_WORKSPACE_TENSOR
(
workspace
,
CausalSoftmax
,
descriptor
);
infiniopCausalSoftmaxDescriptor_t
desc
=
nullptr
;
if
(
!
desc_opt
)
{
return
new
PlannedMeta
{
INFINICORE_CHECK_ERROR
(
infiniopCreateCausalSoftmaxDescriptor
(
descriptor
,
context
::
getInfiniopHandle
(
device
),
&
desc
,
graph
::
GraphTensor
(
workspace
),
output
->
desc
(),
input
->
desc
()));
graph
::
GraphTensor
(
output
),
cache
.
put
(
seed
,
desc
);
graph
::
GraphTensor
(
input
)};
}
else
{
}
desc
=
*
desc_opt
;
}
size_t
workspace_size
=
0
;
void
run
(
void
*
planned_meta
)
{
INFINICORE_CHECK_ERROR
(
infiniopGetCausalSoftmaxWorkspaceSize
(
desc
,
&
workspace_size
));
auto
planned
=
reinterpret_cast
<
PlannedMeta
*>
(
planned_meta
);
std
::
shared_ptr
<
Memory
>
workspace
=
context
::
allocateMemory
(
workspace_size
);
INFINICORE_CHECK_ERROR
(
infiniopCausalSoftmax
(
INFINICORE_CHECK_ERROR
(
infiniopCausalSoftmax
(
desc
,
workspace
->
data
(),
workspace_size
,
planned
->
descriptor
->
desc
,
output
->
data
(),
input
->
data
(),
context
::
getStream
()));
planned
->
workspace
->
data
(),
planned
->
workspace
->
numel
(),
planned
->
output
->
data
(),
planned
->
input
->
data
(),
context
::
getStream
()));
}
void
cleanup
(
void
**
planned_meta_ptr
)
{
delete
*
reinterpret_cast
<
PlannedMeta
**>
(
planned_meta_ptr
);
*
planned_meta_ptr
=
nullptr
;
}
}
static
bool
registered
=
[]()
{
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE
(
CausalSoftmax
,
&
plan
,
&
run
,
&
cleanup
);
CausalSoftmax
::
dispatcher
().
registerAll
(
&
calculate
,
false
);
return
true
;
}();
}
// namespace infinicore::op::causal_softmax_impl::infiniop
}
// namespace infinicore::op::causal_softmax_impl::infiniop
src/infinicore/ops/dequantize_awq/dequantize_awq.cc
0 → 100644
View file @
8d09630a
#include "infinicore/ops/dequantize_awq.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(DequantizeAWQ);

// Dequantizes AWQ-packed weights into x, given per-group scales and packed
// zero points (layout assumed from the parameter names — confirm against the
// kernel implementation).
DequantizeAWQ::DequantizeAWQ(Tensor x, const Tensor &x_packed,
                             const Tensor &x_scale, const Tensor &x_zeros) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zeros);
    INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zeros);
}

// Records into the active graph when one is being captured, otherwise runs eagerly.
void DequantizeAWQ::execute(Tensor x, const Tensor &x_packed,
                            const Tensor &x_scale, const Tensor &x_zeros) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(DequantizeAWQ, x, x_packed, x_scale, x_zeros);
}

// Free-function convenience wrapper writing into a caller-provided output.
void dequantize_awq_(Tensor x, const Tensor &x_packed,
                     const Tensor &x_scale, const Tensor &x_zeros) {
    DequantizeAWQ::execute(x, x_packed, x_scale, x_zeros);
}

} // namespace infinicore::op
Prev
1
2
3
4
5
6
7
8
9
…
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment