jerrrrry / infinilm / Commits / 22804eaa

Unverified commit 22804eaa, authored Sep 02, 2025 by blkmjsian; committed by GitHub, Sep 02, 2025.

Commit message:

    [T2-3-1]blkmjsian
    - deepseek - jiuge 4B awq

Parent: 5c6000ec
Showing 10 changed files with 783 additions and 40 deletions (+783 −40).
    src/models/inference_context.cpp            +87  −13
    src/models/inference_context.hpp            +51   −5
    src/models/jiuge/jiuge.cpp                  +12  −11
    src/models/jiuge/jiuge_impl.hpp              +3   −5
    src/models/jiuge_awq/jiuge_awq.cpp         +397   −0
    src/models/jiuge_awq/jiuge_awq.hpp          +82   −0
    src/models/jiuge_awq/jiuge_awq_weight.cpp  +128   −0
    src/tensor.hpp                               +2   −1
    src/tensor/tensor.cpp                       +19   −5
    xmake.lua                                    +2   −0
src/models/inference_context.cpp (view file @ 22804eaa)
```diff
@@ -2,12 +2,12 @@
 #include "../tensor.hpp"
 #include "../utils.hpp"
 
-InferenceContext::InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager,
-                                   infinirtStream_t stream)
-    : rsrc(rsrc), cache_manager(cache_manager), stream(stream) {}
+InferenceContext::InferenceContext(infiniopHandle_t op_handle_,
+                                   std::shared_ptr<MemoryPool> memory_pool_,
+                                   CacheManager *cache_manager, infinirtStream_t stream)
+    : op_handle(op_handle_), memory_pool(memory_pool_),
+      cache_manager(cache_manager), stream(stream) {}
 
 void InferenceContext::ensure_workspace(size_t required_size) {
     if (required_size > current_workspace_size || !workspace_storage) {
-        workspace_storage = Storage::createFromPool(required_size, rsrc->memory_pool);
+        workspace_storage = Storage::createFromPool(required_size, memory_pool);
         current_workspace_size = required_size;
     }
 }
```
```diff
@@ -19,7 +19,7 @@ void InferenceContext::add(std::shared_ptr<Tensor> c,
     infiniopAddDescriptor_t desc;
     if (!cache_manager->getAddDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateAddDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+        RUN_INFINI(infiniopCreateAddDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
         cache_manager->putAddDescriptor(key, desc);
     }
```
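Every operator wrapper in this file gets the same mechanical change: descriptor creation now uses the context's own `op_handle` instead of reaching through `rsrc->handle`, which is what frees `InferenceContext` from the model-specific `DeviceResource`. The caching idea itself is untouched: descriptors are keyed by the participating tensors' metadata and reused across calls. A minimal sketch of that pattern under assumed semantics (simplified names, not the library's actual classes):

```cpp
#include <cstddef>
#include <unordered_map>

// Sketch only: a descriptor cache keyed by a hash of the tensors'
// shapes/strides/dtypes. Desc stands for any infiniop descriptor handle;
// makeDesc builds one when the key is missing.
template <typename Desc, typename MakeFn>
Desc getOrCreate(std::unordered_map<size_t, Desc> &cache, size_t key, MakeFn makeDesc) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        it = cache.emplace(key, makeDesc()).first; // build once, reuse on later calls
    }
    return it->second;
}
```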
```diff
@@ -42,7 +42,7 @@ void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
     infiniopRMSNormDescriptor_t desc;
     if (!cache_manager->getRMSNormDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRMSNormDescriptor(rsrc->handle, &desc, y->desc(),
-                                                   x->desc(), w->desc(), epsilon));
+        RUN_INFINI(infiniopCreateRMSNormDescriptor(op_handle, &desc, y->desc(),
+                                                   x->desc(), w->desc(), epsilon));
         cache_manager->putRMSNormDescriptor(key, desc);
     }
```
```diff
@@ -64,7 +64,7 @@ void InferenceContext::gemm(std::shared_ptr<Tensor> c,
     infiniopGemmDescriptor_t desc;
     if (!cache_manager->getGemmDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateGemmDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+        RUN_INFINI(infiniopCreateGemmDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
         cache_manager->putGemmDescriptor(key, desc);
     }
```
```diff
@@ -84,7 +84,7 @@ void InferenceContext::rearrange(std::shared_ptr<Tensor> dst,
     infiniopRearrangeDescriptor_t desc;
     if (!cache_manager->getRearrangeDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRearrangeDescriptor(rsrc->handle, &desc, dst->desc(), src->desc()));
+        RUN_INFINI(infiniopCreateRearrangeDescriptor(op_handle, &desc, dst->desc(), src->desc()));
         cache_manager->putRearrangeDescriptor(key, desc);
     }
```
```diff
@@ -105,7 +105,7 @@ void InferenceContext::rope(std::shared_ptr<Tensor> q,
     infiniopRoPEDescriptor_t desc;
     if (!cache_manager->getRoPEDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRoPEDescriptor(rsrc->handle, &desc, q->desc(), k->desc(),
+        RUN_INFINI(infiniopCreateRoPEDescriptor(op_handle, &desc, q->desc(), k->desc(),
                                                 pos->desc(), sin->desc(), cos->desc()));
         cache_manager->putRoPEDescriptor(key, desc);
     }
```
```diff
@@ -121,6 +121,32 @@ void InferenceContext::rope(std::shared_ptr<Tensor> q,
                             sin->data(), cos->data(), stream));
 }
 
+void InferenceContext::rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                               std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                               std::shared_ptr<Tensor> cos) {
+    size_t key = CacheManager::createDescriptorKey(q, k, pos, sin, cos);
+    infiniopRoPEv2Descriptor_t desc;
+    if (!cache_manager->getRoPEv2Descriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateRoPEv2Descriptor(op_handle, &desc, q->desc(), k->desc(),
+                                                  pos->desc(), sin->desc(), cos->desc()));
+        cache_manager->putRoPEv2Descriptor(key, desc);
+    }
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetRoPEv2WorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+    RUN_INFINI(infiniopRoPEv2(desc, workspace, workspace_size, q->data(), k->data(),
+                              pos->data(), sin->data(), cos->data(), stream));
+}
```
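The three workspace-using operators added by this commit (`rope_v2` above, `topkrouter` and `dequant` below) all follow the same protocol: query the descriptor's workspace size, grow the context's shared scratch buffer if needed, then hand the raw buffer to the kernel. Because `ensure_workspace` only ever grows the allocation and every call is issued on the same stream, one scratch buffer can be shared across all three operators. A condensed sketch of the call sequence, where `infiniopGetXWorkspaceSize` / `infiniopX` are placeholders standing in for any of the three ops:

```cpp
// Sketch of the common protocol; "X" stands for RoPEv2 / Topkrouter / Dequantize.
size_t workspace_size = 0;
RUN_INFINI(infiniopGetXWorkspaceSize(desc, &workspace_size)); // 1. ask the op for its scratch size
ensure_workspace(workspace_size);                             // 2. grow (never shrink) the shared buffer
void *workspace = workspace_storage->memory();                // 3. pass the raw pointer to the kernel
RUN_INFINI(infiniopX(desc, workspace, workspace_size, /* tensor args... */ stream));
```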
```diff
@@ -128,7 +154,7 @@ void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x)
     size_t key = CacheManager::createDescriptorKey(y, x);
     infiniopCausalSoftmaxDescriptor_t desc;
     if (!cache_manager->getCausalSoftmaxDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(rsrc->handle, &desc, y->desc(), x->desc()));
+        RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(op_handle, &desc, y->desc(), x->desc()));
         cache_manager->putCausalSoftmaxDescriptor(key, desc);
     }
```
```diff
@@ -141,6 +167,31 @@ void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y,
                                       y->data(), x->data(), stream));
 }
 
+void InferenceContext::topkrouter(std::shared_ptr<Tensor> values,  // F32
+                                  std::shared_ptr<Tensor> indices, // I32
+                                  std::shared_ptr<Tensor> x,
+                                  std::shared_ptr<Tensor> correction_bias, // F32
+                                  float routed_scaling_factor,
+                                  size_t topk) {
+    size_t key = CacheManager::createDescriptorKey(values, indices, x, correction_bias);
+    infiniopTopkrouterDescriptor_t desc;
+    if (!cache_manager->getTopkrouterDescriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateTopkrouterDescriptor(op_handle, &desc, x->desc(),
+                                                      correction_bias->desc()));
+        cache_manager->putTopkrouterDescriptor(key, desc);
+    }
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetTopkrouterWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+    RUN_INFINI(infiniopTopkrouter(desc, workspace, workspace_size, values->data(),
+                                  indices->data(), x->data(), correction_bias->data(),
+                                  routed_scaling_factor, topk, stream));
+}
```
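`topkrouter` is new plumbing for mixture-of-experts routing: given per-token hidden states `x` and a `correction_bias`, it writes routing weights into `values` (F32) and the selected expert indices into `indices` (I32), scaled by `routed_scaling_factor`. The commit message mentions DeepSeek, whose routers use exactly this bias-corrected top-k selection, but nothing in this diff actually calls the function, so the kernel semantics beyond the signature are an inference. A hypothetical call site, with made-up shapes purely for illustration:

```cpp
// Hypothetical usage (not in this diff): route ntok tokens, picking 8 experts each.
// The {ntok, 8} output shapes are assumptions about the kernel's layout.
auto values  = Tensor::buffer(INFINI_DTYPE_F32, {ntok, 8}, ctx.memory_pool);
auto indices = Tensor::buffer(INFINI_DTYPE_I32, {ntok, 8}, ctx.memory_pool);
ctx.topkrouter(values, indices, x, correction_bias,
               /*routed_scaling_factor=*/2.5f, /*topk=*/8);
```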
```diff
@@ -149,7 +200,7 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> gate)
     infiniopSwiGLUDescriptor_t desc;
     if (!cache_manager->getSwiGLUDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateSwiGLUDescriptor(rsrc->handle, &desc, out->desc(), up->desc(), gate->desc()));
+        RUN_INFINI(infiniopCreateSwiGLUDescriptor(op_handle, &desc, out->desc(), up->desc(), gate->desc()));
         cache_manager->putSwiGLUDescriptor(key, desc);
     }
```
```diff
@@ -170,7 +221,7 @@ void InferenceContext::randomSample(std::shared_ptr<Tensor> out,
     infiniopRandomSampleDescriptor_t desc;
     if (!cache_manager->getRandomSampleDescriptor(key, desc)) {
-        RUN_INFINI(infiniopCreateRandomSampleDescriptor(rsrc->handle, &desc, out->desc(), prob->desc()));
+        RUN_INFINI(infiniopCreateRandomSampleDescriptor(op_handle, &desc, out->desc(), prob->desc()));
         cache_manager->putRandomSampleDescriptor(key, desc);
     }
```
```diff
@@ -209,8 +260,8 @@ void InferenceContext::linear(std::shared_ptr<Tensor> c,
     if (beta == 0.0) {
         gemm(c, a, b, alpha, 1.0);
     } else {
-        auto c_copy = Tensor::buffer(c->dtype(), c->shape(), rsrc->memory_pool);
-        c_copy->copyFrom(c, rsrc->handle, stream);
+        auto c_copy = Tensor::buffer(c->dtype(), c->shape(), memory_pool);
+        c_copy->copyFrom(c, op_handle, stream);
         gemm(c, a, b, alpha, beta);
         add(c, c, c_copy);
     }
```
```diff
@@ -231,3 +282,26 @@ void InferenceContext::linear(std::shared_ptr<Tensor> c,
         add(c, c, bias->view_as(c->shape(), strides));
     }
 }
+
+void InferenceContext::dequant(std::shared_ptr<Tensor> weight, std::shared_ptr<Tensor> in_w,
+                               std::shared_ptr<Tensor> in_s, std::shared_ptr<Tensor> in_z) {
+    size_t key = CacheManager::createDescriptorKey(weight, in_w, in_s, in_z);
+    infiniopDequantizeDescriptor_t desc;
+    if (!cache_manager->getDequantizeDescriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateDequantizeDescriptor(op_handle, &desc, weight->desc(),
+                                                      in_w->desc(), in_s->desc(), in_z->desc()));
+        cache_manager->putDequantizeDescriptor(key, desc);
+    }
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetDequantizeWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+    RUN_INFINI(infiniopDequantize(desc, workspace, workspace_size, weight->data(),
+                                  in_w->data(), in_s->data(), in_z->data(), 0, 0, 0, stream));
+}
```
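`dequant` expands a packed AWQ weight back to the activation dtype: `in_w` carries the packed 4-bit values, `in_s` the per-group F16 scales, and `in_z` the packed zero points (see `QuantInt4Weight` and the shapes registered in `jiuge_awq_weight.cpp` below). Assuming the usual AWQ group-quantization rule with group size g (`quant_group_size`), the recovered element is:

```latex
% Assumed AWQ dequantization rule, group size g; q and z are the 4-bit
% values unpacked from in_w and in_z respectively:
W_{i,j} = \bigl(q_{i,j} - z_{\lfloor i/g\rfloor,\,j}\bigr)\, s_{\lfloor i/g\rfloor,\,j}
```

The three literal `0` scalar arguments to `infiniopDequantize` are not explained anywhere in this diff.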
src/models/inference_context.hpp (view file @ 22804eaa)
```diff
 #pragma once
-#include "cache_manager.hpp"
-#include "jiuge/jiuge_impl.hpp"
-#include "jiuge/jiuge_weight.hpp"
+#include "../cache_manager/opcache_manager.hpp"
 #include <cassert>
 
 struct InferenceContext {
-    DeviceResource *rsrc;
+    infiniopHandle_t op_handle;
+    std::shared_ptr<MemoryPool> memory_pool;
     CacheManager *cache_manager;
     infinirtStream_t stream;
+    std::shared_ptr<Storage> workspace_storage;
+    size_t current_workspace_size = 0;
 
-    InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream);
+    InferenceContext(infiniopHandle_t op_handle, std::shared_ptr<MemoryPool> memory_pool,
+                     CacheManager *cache_manager, infinirtStream_t stream);
     void ensure_workspace(size_t required_size);
```
```diff
@@ -34,8 +34,21 @@ struct InferenceContext {
               std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
               std::shared_ptr<Tensor> cos);
+    void rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                 std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                 std::shared_ptr<Tensor> cos);
     void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x);
+    void topkrouter(std::shared_ptr<Tensor> values,  // F32
+                    std::shared_ptr<Tensor> indices, // I32
+                    std::shared_ptr<Tensor> x,
+                    std::shared_ptr<Tensor> correction_bias, // F32
+                    float routed_scaling_factor,
+                    size_t topk);
     void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
                 std::shared_ptr<Tensor> gate);
```
```diff
@@ -49,6 +62,10 @@ struct InferenceContext {
                 float alpha, float beta,
                 std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias);
+    void dequant(std::shared_ptr<Tensor> weight, std::shared_ptr<Tensor> in_w,
+                 std::shared_ptr<Tensor> in_s, std::shared_ptr<Tensor> in_z);
 };
```
```diff
 namespace {
@@ -88,10 +105,31 @@ inline void rope(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
     getInferenceContext().rope(q, k, pos, sin, cos);
 }
 
+inline void rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                    std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                    std::shared_ptr<Tensor> cos) {
+    getInferenceContext().rope_v2(q, k, pos, sin, cos);
+}
+
 inline void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x) {
     getInferenceContext().causalSoftmax(y, x);
 }
 
+inline void topkrouter(std::shared_ptr<Tensor> values,  // F32
+                       std::shared_ptr<Tensor> indices, // I32
+                       std::shared_ptr<Tensor> x,
+                       std::shared_ptr<Tensor> correction_bias, // F32
+                       float routed_scaling_factor,
+                       size_t topk) {
+    getInferenceContext().topkrouter(values, indices, x, correction_bias,
+                                     routed_scaling_factor, topk);
+}
+
 inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
                    std::shared_ptr<Tensor> gate) {
     getInferenceContext().swiglu(out, up, gate);
```
```diff
@@ -107,3 +145,11 @@ inline void linear(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
                    std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
     getInferenceContext().linear(c, a, b, alpha, beta, residual, bias);
 }
+
+inline void dequant_linear(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> x,
+                           std::shared_ptr<Tensor> w_w, std::shared_ptr<Tensor> w_s,
+                           std::shared_ptr<Tensor> w_z, float alpha, float beta,
+                           std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
+    auto w = Tensor::buffer(x->dtype(), {x->shape()[1], out->shape()[1]},
+                            getInferenceContext().memory_pool);
+    getInferenceContext().dequant(w, w_w, w_s, w_z);
+    getInferenceContext().linear(out, x, w, alpha, beta, residual, bias);
+}
```
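`dequant_linear` is the workhorse the AWQ model uses for every projection: it materializes a full-precision weight `w` of shape `{in_features, out_features}` (read off `x->shape()[1]` and `out->shape()[1]`) from the memory pool, expands the packed int4 data into it with `dequant`, then runs an ordinary `linear`. Dequantizing on the fly means the fp16 weight exists only for the duration of the call, trading extra per-layer compute for a much smaller resident footprint. The AWQ forward pass in `jiuge_awq.cpp` below invokes it like this for the q-projection:

```cpp
// From the AWQ forward pass below: q_proj as a dequantized linear, optional bias.
dequant_linear(q_buf, logits_out,
               weight->w_attn_q[layer]->w, weight->w_attn_q[layer]->s, weight->w_attn_q[layer]->z,
               1.0, 0.0, nullptr,
               has_qkv_bias ? weight->b_attn_q[layer] : nullptr);
```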
src/models/jiuge/jiuge.cpp (view file @ 22804eaa)
```diff
@@ -10,7 +10,7 @@
 #include <thread>
 #include <vector>
 
-void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
+void createDeviceResource(JiugeDeviceResource *rsrc, const JiugeMeta *meta,
                           const JiugeWeights *weights, infiniDevice_t device,
                           int idev, int ndev, int dev_id,
```
```diff
@@ -44,7 +44,7 @@ void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
     auto memory_pool = std::make_shared<MemoryPool>(128 * 1024 * 1024);
-    *rsrc = DeviceResource{
+    *rsrc = JiugeDeviceResource{
         device,
         dev_id,
         handle,
```
```diff
@@ -67,7 +67,7 @@ void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
     RUN_INFINI(infinirtDeviceSynchronize());
 }
 
-void releaseDeviceResource(DeviceResource &res) {
+void releaseDeviceResource(JiugeDeviceResource &res) {
     infinirtDeviceSynchronize();
     // Release individual Tensors
     res.w_in_embd.reset();
```
```diff
@@ -111,7 +111,7 @@ void releaseDeviceResource(DeviceResource &res) {
     res.comm = nullptr;
 }
 
-void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
+void inferDeviceBatch(const JiugeMeta &meta, JiugeDeviceResource &rsrc,
                       uint32_t idev, uint32_t ndev,
                       const uint32_t *tokens, uint32_t ntok,
                       const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
```
```diff
@@ -298,7 +298,7 @@ void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
 }
 
 __C void
-inferBatch(struct JiugeModel *model,
+inferBatchJiuge(struct JiugeModel *model,
            const uint32_t *tokens, uint32_t ntok,
            const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
            struct KVCache **kv_caches,
```
```diff
@@ -331,7 +331,7 @@ inferBatch(struct JiugeModel *model,
 }
 
 __C void
-forwardBatch(struct JiugeModel *model,
+forwardBatchJiuge(struct JiugeModel *model,
              const uint32_t *tokens, uint32_t ntok,
              const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
              struct KVCache **kv_caches,
```
```diff
@@ -362,16 +362,17 @@ forwardBatch(struct JiugeModel *model,
     }
 }
 
-void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc,
-                  InferState &state, InferRequest &req,
+void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, JiugeDeviceResource *rsrc,
+                  InferState &state, InferRequest &req,
                   infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
+    // Create Device Resource
+    createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
     CacheManager cache_manager(100);
-    InferenceContext ctx(rsrc, &cache_manager, rsrc->stream);
+    InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
     // Set the inference context for this thread
     setInferenceContext(&ctx);
-    // Create Device Resource
-    createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
     {
         std::unique_lock<std::mutex> lock(state.mtx);
         state.loaded = true;
```
```diff
@@ -406,7 +407,7 @@ JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infi
     int ndev = int(device_ids.size());
     device = device_;
     dev_ids = device_ids;
-    dev_resources = std::vector<DeviceResource>(ndev);
+    dev_resources = std::vector<JiugeDeviceResource>(ndev);
     states = std::vector<InferState>(ndev);
     threads.resize(ndev);
     RUN_INFINI(infinirtInit());
```
src/models/jiuge/jiuge_impl.hpp (view file @ 22804eaa)
```diff
@@ -12,7 +12,7 @@
 #include <thread>
 #include <vector>
 
-struct DeviceResource {
+struct JiugeDeviceResource {
     // Device
     infiniDevice_t device;
    int device_id;
```
```diff
@@ -56,7 +56,7 @@ struct JiugeModel {
     JiugeMeta meta;
     infiniDevice_t device;
     std::vector<int> dev_ids;
-    std::vector<DeviceResource> dev_resources;
+    std::vector<JiugeDeviceResource> dev_resources;
     std::vector<InferState> states;
     std::vector<std::thread> threads;
     InferRequest req;
```
```diff
@@ -64,8 +64,6 @@ struct JiugeModel {
     JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device,
                std::vector<int> device_ids);
 };
 
 struct KVCache {
     std::vector<std::vector<std::shared_ptr<Tensor>>> k, v;
 };
 
 #include "../../cache.hpp"
 #endif
```
src/models/jiuge_awq/jiuge_awq.cpp (new file, 0 → 100644; view file @ 22804eaa)
```cpp
#include "jiuge_awq.hpp"

#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "../inference_context.hpp"

#include <random>
#include <thread>
#include <vector>

void createDeviceResource(DeviceResource *rsrc, const JiugeAWQMeta *meta,
                          std::shared_ptr<JiugeAWQDeviceWeight> weights,
                          infiniDevice_t device, int idev, int ndev, int dev_id,
                          infinicclComm_t comm) {
    RUN_INFINI(infinirtSetDevice(device, dev_id));
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle);
    infinirtStream_t stream;
    infinirtStreamCreate(&stream);

    auto memory_pool = std::make_shared<MemoryPool>(128 * 1024 * 1024);

    *rsrc = DeviceResource{
        device,
        dev_id,
        handle,
        weights,
        stream,
        comm,
        memory_pool,
    };
    RUN_INFINI(infinirtDeviceSynchronize());
}

void releaseDeviceResource(DeviceResource &res) {
    infinirtDeviceSynchronize();
    // Release individual Tensors
    infiniopDestroyHandle(res.handle);
    res.handle = nullptr;
    infinirtStreamDestroy(res.stream);
    res.stream = nullptr;
    infinicclCommDestroy(res.comm);
    res.comm = nullptr;
}

void inferDeviceBatch(const JiugeAWQMeta *meta, DeviceResource &rsrc,
                      uint32_t idev, uint32_t ndev,
                      const uint32_t *tokens, uint32_t ntok,
                      const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                      struct KVCache **kv_caches,
                      const float *temperature, const uint32_t *topk, const float *topp,
                      uint32_t *output, void *last_logits) {
    auto nlayer = meta->nlayer;
    auto nkvh = meta->nkvh / ndev;
    auto nh = meta->nh / ndev;
    auto ngroup = nh / nkvh;
    // auto dctx = meta.dctx;
    auto dh = meta->dh;
    auto d = meta->d;
    auto dt_logits = meta->dt_logits;
    auto di = meta->di / ndev;
    auto dvoc = meta->dvoc;
    auto stream = rsrc.stream;
    auto weight = rsrc.weights;
    bool has_qkv_bias = meta->has_qkv_bias;

    // Allocate buffers
    auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool);
    auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool);
    auto q_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool);
    auto k_buf = Tensor::buffer(dt_logits, {ntok, nkvh * dh}, rsrc.memory_pool);
    auto v_buf = Tensor::buffer(dt_logits, {ntok, nkvh * dh}, rsrc.memory_pool);
    auto gate_buf = Tensor::buffer(dt_logits, {ntok, di}, rsrc.memory_pool);
    auto up_buf = Tensor::buffer(dt_logits, {ntok, di}, rsrc.memory_pool);
    auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool);
    auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool);
    auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool);
    auto result_cpu = std::vector<int64_t>(nreq);

    // Prepare inputs
    auto batch_pos_ids = std::vector<uint32_t>(ntok);
    size_t req_start = 0;
    for (uint32_t req = 0; req < nreq; req++) {
        for (uint32_t i = 0; i < req_lens[req]; i++) {
            batch_pos_ids[req_start + i] = req_pos[req] + i;
        }
        req_start += req_lens[req];
    }

    std::shared_ptr<Tensor> pos_ids_buf;
    if (rsrc.device == INFINI_DEVICE_CPU) {
        pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
    } else {
        pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool);
        RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(),
                                       sizeof(uint32_t) * ntok, INFINIRT_MEMCPY_H2D, stream));
    }
    for (uint32_t i = 0; i < ntok; i++) {
        RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
                                       weight->w_in_embd->data(tokens[i] * d),
                                       dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream));
    }
```
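Two details of this setup are worth noting. Position ids are staged differently per backend: on CPU the host vector is wrapped directly via `Tensor::weight`, while on accelerators they are copied host-to-device asynchronously on the compute stream. The embedding lookup is then a gather implemented as one async device-to-device row copy per token, with `Tensor::data(offset)` evidently acting as an element-offset pointer:

```cpp
// What the gather loop above computes (d = hidden size):
//   logits_in[i, :] = w_in_embd[tokens[i], :]   for each i in [0, ntok)
// i.e. each iteration enqueues one d-element embedding-row copy on `stream`.
```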
```cpp
    // Attention
    // attention inner
    size_t max_qk_size = 0;
    size_t max_seq_len = 0;
    for (uint32_t req = 0; req < nreq; req++) {
        auto past_len = req_pos[req];
        auto seq_len = req_lens[req];
        auto total_len = past_len + seq_len;
        max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len));
        max_seq_len = std::max(max_seq_len, size_t(seq_len));
    }
    auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool);
    auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
    auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh});
    auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
    auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh});

    // Compute
    for (uint32_t layer = 0; layer < nlayer; layer++) {
        // 1. Attention
        // rms norm
        rmsnorm(logits_out, logits_in, weight->w_attn_norm[layer], meta->epsilon);
        // qkv_proj
        dequant_linear(q_buf, logits_out,
                       weight->w_attn_q[layer]->w, weight->w_attn_q[layer]->s, weight->w_attn_q[layer]->z,
                       1.0, 0.0, nullptr,
                       has_qkv_bias ? weight->b_attn_q[layer] : nullptr);
        dequant_linear(k_buf, logits_out,
                       weight->w_attn_k[layer]->w, weight->w_attn_k[layer]->s, weight->w_attn_k[layer]->z,
                       1.0, 0.0, nullptr,
                       has_qkv_bias ? weight->b_attn_k[layer] : nullptr);
        dequant_linear(v_buf, logits_out,
                       weight->w_attn_v[layer]->w, weight->w_attn_v[layer]->s, weight->w_attn_v[layer]->z,
                       1.0, 0.0, nullptr,
                       has_qkv_bias ? weight->b_attn_v[layer] : nullptr);
        // rope
        rope_v2(q_buf->view({ntok, nh, dh}), q_buf->view({ntok, nh, dh}),
                pos_ids_buf, weight->sin_table, weight->cos_table);
        rope_v2(k_buf->view({ntok, nkvh, dh}), k_buf->view({ntok, nkvh, dh}),
                pos_ids_buf, weight->sin_table, weight->cos_table);

        size_t token_offset = 0;
        for (uint32_t req = 0; req < nreq; req++) {
            auto past_len = req_pos[req];
            auto seq_len = req_lens[req];
            auto total_len = past_len + seq_len;
            auto o = o_buf->slice({{0, token_offset, seq_len}})
                         ->view({seq_len, nkvh, ngroup, dh})
                         ->permute({1, 2, 0, 3});
            auto q = q_buf->slice({{0, token_offset, seq_len}})
                         ->view({seq_len, nkvh, ngroup, dh})
                         ->permute({1, 2, 0, 3});
            auto k = k_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh});
            auto v = v_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh});
            // self attention
            // concat
            rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k);
            rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v);
            // qk
            rearrange(q_rearrange->slice(2, 0, seq_len), q);
            auto qk_gemm = qk_buf->slice(1, 0, seq_len * total_len)
                               ->view({nkvh, ngroup * seq_len, total_len});
            auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0});
            linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm,
                   1.f / float(sqrt(dh)), 0.f, nullptr, nullptr);
            // softmax
            auto qk_softmax = qk_buf->slice(1, 0, seq_len * total_len)
                                  ->view({nh, seq_len, total_len});
            causalSoftmax(qk_softmax, qk_softmax);
            auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2});
            linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm,
                   1.f, 0.f, nullptr, nullptr);
            // rearrange attn val
            rearrange(o, attn_val_gemm->slice(2, 0, seq_len));
            token_offset += seq_len;
        }
```
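The per-request attention uses a grouped-query layout: queries are viewed as `{nkvh, ngroup, seq_len, dh}` with `ngroup = nh / nkvh`, so the score and value products each run as one batched GEMM per KV head covering all of its query groups, operating directly against the KV-cache slices. Per head this is ordinary causal scaled dot-product attention, with the `1.f / float(sqrt(dh))` scale folded into the first `linear` call's alpha:

```latex
% What the qk-GEMM, causalSoftmax, and value-GEMM above compute per head:
O = \operatorname{causalSoftmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_h}}\right) V
```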
```cpp
        // o_proj
        dequant_linear(logits_in, o_buf,
                       weight->w_attn_out[layer]->w, weight->w_attn_out[layer]->s, weight->w_attn_out[layer]->z,
                       1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
        // only rank 0 adds residual
        // All_reduce if distributed
        if (rsrc.comm != nullptr) {
            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d,
                                          dt_logits, INFINICCL_SUM, rsrc.comm, stream));
            RUN_INFINI(infinirtStreamSynchronize(stream));
        }

        // 2. FFN
        rmsnorm(logits_out, logits_in, weight->w_ffn_norm[layer], meta->epsilon);
        dequant_linear(gate_buf, logits_out,
                       weight->w_ffn_gate[layer]->w, weight->w_ffn_gate[layer]->s, weight->w_ffn_gate[layer]->z,
                       1.0, 0.0, nullptr, nullptr);
        dequant_linear(up_buf, logits_out,
                       weight->w_ffn_up[layer]->w, weight->w_ffn_up[layer]->s, weight->w_ffn_up[layer]->z,
                       1.0, 0.0, nullptr, nullptr);
        swiglu(gate_buf, up_buf, gate_buf);
        dequant_linear(logits_in, gate_buf,
                       weight->w_ffn_down[layer]->w, weight->w_ffn_down[layer]->s, weight->w_ffn_down[layer]->z,
                       1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
        // only rank 0 adds residual
        // All_reduce if distributed
        if (rsrc.comm != nullptr) {
            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d,
                                          dt_logits, INFINICCL_SUM, rsrc.comm, stream));
            RUN_INFINI(infinirtStreamSynchronize(stream));
        }
    }

    // Sample and Output
    if (idev == 0) {
        if (last_logits != nullptr) {
            rmsnorm(logits_out, logits_in, weight->w_out_norm, meta->epsilon);
            auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool);
            linear(last_logits_buf, logits_out, weight->w_out_embd, 1.0, 0.0, nullptr, nullptr);
            RUN_INFINI(infinirtStreamSynchronize(stream));
            RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(),
                                      dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H));
        }
        if (output != nullptr) {
            size_t token_offset = 0;
            for (uint32_t req = 0; req < nreq; req++) {
                auto seq_len = req_lens[req];
                token_offset += seq_len;
                rmsnorm(logits_out->slice(0, req, 1),
                        logits_in->slice(0, token_offset - 1, 1),
                        weight->w_out_norm, meta->epsilon);
            }
            linear(prob_buf, logits_out->slice(0, 0, nreq), weight->w_out_embd,
                   1.0, 0.0, nullptr, nullptr);
            std::random_device _rd;
            std::mt19937 gen(_rd());
            token_offset = 0;
            for (uint32_t req = 0; req < nreq; req++) {
                auto seq_len = req_lens[req];
                float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
                randomSample(result_buf->slice(0, req, 1)->view_as({}, {}),
                             prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}),
                             random_val, topp[req], topk[req], temperature[req]);
                token_offset += seq_len;
            }
            RUN_INFINI(infinirtStreamSynchronize(stream));
            RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(),
                                      sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H));
            for (uint32_t req = 0; req < nreq; req++) {
                output[req] = uint32_t(result_cpu[req]);
            }
        }
    }
}
```
```cpp
__C void
inferBatchJiugeAWQ(struct JiugeAWQModel *model,
                   const uint32_t *tokens, uint32_t ntok,
                   const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                   struct KVCache **kv_caches,
                   const float *temperature, const uint32_t *topk, const float *topp,
                   uint32_t *output) {
    model->req.tokens = tokens;
    model->req.ntok = ntok;
    model->req.req_lens = req_lens;
    model->req.nreq = nreq;
    model->req.req_pos = req_pos;
    model->req.kv_caches = kv_caches;
    model->req.output = output;
    model->req.logits = nullptr;
    model->req.temperature = temperature;
    model->req.topk = topk;
    model->req.topp = topp;

    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].proceed = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t i = model->dev_ids.size(); i > 0; i--) {
        auto idev = i - 1;
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
        lock.unlock();
    }
}

__C void
forwardBatchJiugeAWQ(struct JiugeAWQModel *model,
                     const uint32_t *tokens, uint32_t ntok,
                     const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                     struct KVCache **kv_caches,
                     void *logits) {
    model->req.tokens = tokens;
    model->req.ntok = ntok;
    model->req.req_lens = req_lens;
    model->req.nreq = nreq;
    model->req.req_pos = req_pos;
    model->req.kv_caches = kv_caches;
    model->req.output = nullptr;
    model->req.logits = logits;
    model->req.temperature = nullptr;
    model->req.topk = nullptr;
    model->req.topp = nullptr;

    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].proceed = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t i = model->dev_ids.size(); i > 0; i--) {
        auto idev = i - 1;
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
        lock.unlock();
    }
}

void launchDevice(const JiugeAWQMeta *meta, std::shared_ptr<JiugeAWQDeviceWeight> weights,
                  DeviceResource *rsrc, InferState &state, InferRequest &req,
                  infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
    // Create Device Resource
    createDeviceResource(rsrc, meta, weights, device, idev, ndev, dev_id, comm);
    CacheManager cache_manager(100);
    InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
    // Set the inference context for this thread
    setInferenceContext(&ctx);

    {
        std::unique_lock<std::mutex> lock(state.mtx);
        state.loaded = true;
        lock.unlock();
        state.cv_load.notify_one();
    }

    // Infer Loop
    while (true) {
        std::unique_lock<std::mutex> lock(state.mtx);
        state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; });
        // quit if exit_flag is set
        if (state.exit_flag) {
            break;
        }
        inferDeviceBatch(meta, *rsrc, idev, ndev,
                         req.tokens, req.ntok, req.req_lens, req.nreq, req.req_pos,
                         req.kv_caches, req.temperature, req.topk, req.topp,
                         req.output, req.logits);
        state.proceed = false;
        lock.unlock();
        state.cv_done.notify_one();
    }

    // Clean-Up
    releaseDeviceResource(*rsrc);
    setInferenceContext(nullptr); // Clear the context when done
}
```
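Each device runs `launchDevice` on its own thread; `inferBatchJiugeAWQ` and `forwardBatchJiugeAWQ` wake every worker by setting `proceed` under the per-device mutex, then block on `cv_done` until each worker clears it again. Stripped to its synchronization skeleton, the handshake is (a self-contained sketch, assumed semantics):

```cpp
#include <condition_variable>
#include <mutex>

// Minimal sketch of the per-device request/response handshake used above.
struct Handshake {
    std::mutex mtx;
    std::condition_variable cv_start, cv_done;
    bool proceed = false;

    void submit() { // caller: publish one request and wake the worker
        { std::lock_guard<std::mutex> g(mtx); proceed = true; }
        cv_start.notify_one();
    }
    void wait_done() { // caller: block until the worker resets `proceed`
        std::unique_lock<std::mutex> lk(mtx);
        cv_done.wait(lk, [&] { return !proceed; });
    }
    template <typename Work>
    void serve_once(Work work) { // worker: wait, run the batch, acknowledge
        std::unique_lock<std::mutex> lk(mtx);
        cv_start.wait(lk, [&] { return proceed; });
        work();
        proceed = false;
        lk.unlock();
        cv_done.notify_one();
    }
};
```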
```cpp
JiugeAWQModel::JiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights_) {
    auto weights = (JiugeAWQWeights *)(weights_);
    device = weights->device();
    dev_ids = weights->dev_ids();
    int ndev = int(dev_ids.size());
    dev_resources = std::vector<DeviceResource>(ndev);
    states = std::vector<InferState>(ndev);
    threads.resize(ndev);
    auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
    if (ndev > 1) {
        RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
    }
    for (int i = 0; i < ndev; i++) {
        threads[i] = std::thread(launchDevice, meta, weights->device_weights()[i],
                                 &dev_resources[i], std::ref(states[i]), std::ref(req),
                                 device, i, ndev, dev_ids[i], comms[i]);
    }
    for (int i = 0; i < ndev; i++) {
        std::unique_lock<std::mutex> lock(states[i].mtx);
        states[i].cv_load.wait(lock, [&] { return states[i].loaded; });
        lock.unlock();
    }
}

__C struct JiugeAWQModel *
createJiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights) {
    JiugeAWQModel *model = new JiugeAWQModel(meta, weights);
    return model;
}

__C void destroyJiugeAWQModel(struct JiugeAWQModel *model) {
    auto ndev = model->dev_resources.size();
    for (size_t idev = 0; idev < ndev; idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].exit_flag = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t idev = 0; idev < ndev; idev++) {
        model->threads[idev].join();
    }
    delete model;
}
```
src/models/jiuge_awq/jiuge_awq.hpp (new file, 0 → 100644; view file @ 22804eaa)
```cpp
#pragma once

#include "infinicore_infer/models/jiuge_awq.h"

#include "../../cache.hpp"
#include "../../dataloader/weights_loader.hpp"

#include <condition_variable>
#include <mutex>
#include <thread>

struct QuantInt4Weight {
    std::shared_ptr<Tensor> w, s, z;
};

struct JiugeAWQDeviceWeight {
    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table, cos_table;
    std::vector<std::shared_ptr<Tensor>> w_attn_norm, b_attn_q, b_attn_k, b_attn_v, w_ffn_norm;
    std::vector<std::shared_ptr<QuantInt4Weight>> w_attn_q, w_attn_k, w_attn_v, w_attn_out,
        w_ffn_gate, w_ffn_up, w_ffn_down;
};

class JiugeAWQWeights : public infinicore::WeightsLoader {
private:
    std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> _device_weights;

public:
    JiugeAWQWeights(const JiugeAWQMeta *meta, infiniDevice_t device,
                    const std::vector<int> &dev_ids);
    std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> &device_weights() {
        return _device_weights;
    }
};

struct DeviceResource {
    // Device
    infiniDevice_t device;
    int device_id;
    infiniopHandle_t handle;
    // Weights
    std::shared_ptr<JiugeAWQDeviceWeight> weights;
    // Streams
    infinirtStream_t stream;
    // Communicator
    infinicclComm_t comm;
    std::shared_ptr<MemoryPool> memory_pool;
};

struct InferRequest {
    const uint32_t *tokens;
    uint32_t ntok;
    const uint32_t *req_lens;
    uint32_t nreq;
    const uint32_t *req_pos;
    struct KVCache **kv_caches;
    const float *temperature;
    const uint32_t *topk;
    const float *topp;
    uint32_t *output;
    void *logits;
};

struct InferState {
    std::mutex mtx;
    std::condition_variable cv_load, cv_start, cv_done;
    bool loaded = false;
    bool proceed = false;
    bool exit_flag = false;
};

struct JiugeAWQModel {
    JiugeAWQMeta meta;
    infiniDevice_t device;
    std::vector<int> dev_ids;
    std::vector<DeviceResource> dev_resources;
    std::vector<InferState> states;
    std::vector<std::thread> threads;
    InferRequest req;

    JiugeAWQModel(const JiugeAWQMeta *, const ModelWeights *);
};
\ No newline at end of file
```
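`QuantInt4Weight` groups the three tensors AWQ needs per projection. Matching the shapes registered in `jiuge_awq_weight.cpp` below (`qweight: {in, out*nbit/32}` I32, `scales: {in/group, out}` F16, `qzeros: {in/group, out*nbit/32}` I32), each int32 packs `32/nbit = 8` four-bit values along the output dimension. A hedged host-side sketch of recovering one element, assuming plain low-to-high nibble order (AWQ exporters sometimes reorder nibbles, and the real kernel's packing order is not shown in this diff):

```cpp
#include <cstdint>
#include <cstddef>

// Sketch: recover the fp weight element (i, j) from AWQ-packed buffers.
// Assumes 8 nibbles per int32, packed low-to-high along the output axis,
// and float scales for simplicity (the real scales tensor is F16).
float dequant_element(const int32_t *qweight, const int32_t *qzeros, const float *scales,
                      size_t i, size_t j, size_t out, size_t group_size) {
    size_t packed_out = out / 8; // 8 four-bit values per int32 (nbit = 4)
    uint32_t wpack = uint32_t(qweight[i * packed_out + j / 8]);
    uint32_t zpack = uint32_t(qzeros[(i / group_size) * packed_out + j / 8]);
    int q = int((wpack >> (4 * (j % 8))) & 0xF); // unpack the weight nibble
    int z = int((zpack >> (4 * (j % 8))) & 0xF); // unpack the zero-point nibble
    return float(q - z) * scales[(i / group_size) * out + j];
}
```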
src/models/jiuge_awq/jiuge_awq_weight.cpp (new file, 0 → 100644; view file @ 22804eaa)
```cpp
#include "jiuge_awq.hpp"

#include <cmath>

inline std::shared_ptr<Tensor> getSinTable(size_t dctx, size_t dh, float theta) {
    auto half_dh = dh / 2;
    auto unit = dsize(INFINI_DTYPE_F16);
    void *table = std::malloc(dctx * half_dh * unit);
    for (size_t i = 0; i < dctx; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _sin = std::sin(static_cast<float>(i)
                                  / std::pow(theta, static_cast<float>(j) / half_dh));
            ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin);
        }
    }
    auto shape = std::vector<size_t>({dctx, half_dh});
    auto tensor = Tensor::weight(table, INFINI_DTYPE_F16, shape);
    std::free(table);
    return tensor;
}

inline std::shared_ptr<Tensor> getCosTable(size_t dctx, size_t dh, float theta) {
    auto half_dh = dh / 2;
    auto unit = dsize(INFINI_DTYPE_F16);
    void *table = std::malloc(dctx * half_dh * unit);
    for (size_t i = 0; i < dctx; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _cos = std::cos(static_cast<float>(i)
                                  / std::pow(theta, static_cast<float>(j) / half_dh));
            ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos);
        }
    }
    auto shape = std::vector<size_t>({dctx, half_dh});
    auto tensor = Tensor::weight(table, INFINI_DTYPE_F16, shape);
    std::free(table);
    return tensor;
}
```
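Both helpers tabulate the rotary-embedding angles once per device, for all `dctx` positions. Entry `(i, j)` of each table is:

```latex
% Angle tables built by getSinTable / getCosTable, with half_dh = d_h / 2:
\mathrm{sin\_table}[i][j] = \sin\!\left(\frac{i}{\theta^{\,j/\mathrm{half\_dh}}}\right),
\qquad
\mathrm{cos\_table}[i][j] = \cos\!\left(\frac{i}{\theta^{\,j/\mathrm{half\_dh}}}\right)
```

They are stored as F16 with one row per position, which is presumably what lets `rope_v2` pick rows via `pos_ids_buf` at run time.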
```cpp
JiugeAWQWeights::JiugeAWQWeights(const JiugeAWQMeta *meta, infiniDevice_t device,
                                 const std::vector<int> &dev_ids)
    : infinicore::WeightsLoader(device, dev_ids) {
    auto ndev = dev_ids.size();
    _device_weights.resize(ndev);
    infiniDtype_t dt_logits = meta->dt_logits;
    infiniDtype_t dt_norm_w = meta->dt_norm_w;
    size_t nlayer = meta->nlayer;
    size_t d = meta->d;
    size_t nh = meta->nh / ndev;
    size_t nkvh = meta->nkvh / ndev;
    size_t dh = meta->dh;
    size_t di = meta->di / ndev;
    size_t dctx = meta->dctx;
    size_t dvoc = meta->dvoc;
    size_t nbit = meta->nbit;
    size_t quant_group_size = meta->quant_group_size;

    for (size_t i = 0; i < ndev; i++) {
        RUN_INFINI(infinirtSetDevice(device, dev_ids[i]));
        auto weight = std::make_shared<JiugeAWQDeviceWeight>();
        _device_weights[i] = weight;

        auto w_in_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d});
        this->resigter("model.embed_tokens.weight", w_in_embd, i);
        weight->w_in_embd = w_in_embd;

        auto w_out_norm = Tensor::weight(nullptr, dt_norm_w, {d});
        this->resigter("model.norm.weight", w_out_norm, i);
        weight->w_out_norm = w_out_norm;

        auto w_out_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d})->permute({1, 0});
        this->resigter("lm_head.weight", w_out_embd, i);
        weight->w_out_embd = w_out_embd;

        weight->sin_table = getSinTable(dctx, dh, meta->theta);
        weight->cos_table = getCosTable(dctx, dh, meta->theta);

        for (size_t layer = 0; layer < nlayer; layer++) {
#define RIGISTER_LAYER_WEIGHT(W_NAME, W_VAR, W_SHAPE, W_DTYPE) \
    auto W_VAR = Tensor::weight(nullptr, W_DTYPE, W_SHAPE);    \
    this->resigter(W_NAME, W_VAR, i);                          \
    weight->W_VAR.push_back(W_VAR);

            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".input_layernorm.weight",
                                  w_attn_norm, {d}, dt_norm_w);

#define REGISTER_LAYER_QUANT_WEIGHT(W_NAME, W_VAR, W_IN, W_OUT)                                           \
    auto W_VAR = std::make_shared<QuantInt4Weight>();                                                     \
    W_VAR->w = Tensor::weight(nullptr, INFINI_DTYPE_I32, {W_IN, (W_OUT)*nbit / 32});                      \
    this->resigter(W_NAME + ".qweight", W_VAR->w, i);                                                     \
    W_VAR->s = Tensor::weight(nullptr, INFINI_DTYPE_F16, {(W_IN) / quant_group_size, (W_OUT)});           \
    this->resigter(W_NAME + ".scales", W_VAR->s, i);                                                      \
    W_VAR->z = Tensor::weight(nullptr, INFINI_DTYPE_I32, {(W_IN) / quant_group_size, (W_OUT)*nbit / 32}); \
    this->resigter(W_NAME + ".qzeros", W_VAR->z, i);                                                      \
    weight->W_VAR.push_back(W_VAR);

            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj",
                                        w_attn_q, d, nh * dh);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj",
                                        w_attn_k, d, nkvh * dh);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj",
                                        w_attn_v, d, nkvh * dh);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj.bias",
                                  b_attn_q, {nh * dh}, INFINI_DTYPE_F16);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj.bias",
                                  b_attn_k, {nkvh * dh}, INFINI_DTYPE_F16);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj.bias",
                                  b_attn_v, {nkvh * dh}, INFINI_DTYPE_F16);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.o_proj",
                                        w_attn_out, nh * dh, d);
            RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight",
                                  w_ffn_norm, {d}, dt_norm_w);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.gate_proj",
                                        w_ffn_gate, d, di);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.up_proj",
                                        w_ffn_up, d, di);
            REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.down_proj",
                                        w_ffn_down, di, d);
        }
    }
#undef RIGISTER_LAYER_WEIGHT
#undef REGISTER_LAYER_QUANT_WEIGHT
}

__C struct ModelWeights *
createJiugeAWQWeights(const JiugeAWQMeta *meta,
                      infiniDevice_t device, int ndev, const int *dev_ids) {
    JiugeAWQWeights *weights = new JiugeAWQWeights(meta, device,
                                                   std::vector<int>(dev_ids, dev_ids + ndev));
    return (struct ModelWeights *)weights;
}
```
src/tensor.hpp (view file @ 22804eaa)
```diff
@@ -2,7 +2,6 @@
 #define INFER_TENSOR_H
 #include "allocator.hpp"
-#include "infinicore_infer.h"
 #include "utils.hpp"
 
 #include <memory>
 #include <string>
```

```diff
@@ -101,6 +100,7 @@ public:
     static std::shared_ptr<Tensor> weight(void *host_data, infiniDtype_t dtype,
                                           const std::vector<size_t> &shape);
+    void load(const void *host_data, infinirtStream_t stream = nullptr);
     std::shared_ptr<Tensor> memShare(const std::vector<size_t> &shape,
                                      infiniDtype_t dtype = INFINI_DTYPE_INVALID) const;
     std::shared_ptr<Tensor> slice(size_t dim, size_t start, size_t len);
```

```diff
@@ -126,6 +126,7 @@ public:
     ptrdiff_t dataOffset() const;
     infiniDevice_t deviceType() const;
     int deviceId() const;
+    size_t numel() const;
     void debug(const std::string &filename) const;
     void debug() const;
```
src/tensor/tensor.cpp (view file @ 22804eaa)
```diff
@@ -113,6 +113,10 @@ infiniDevice_t Tensor::deviceType() const { return this->_storage->deviceType();
 int Tensor::deviceId() const { return this->_storage->deviceId(); }
 
 Tensor::~Tensor() {}
 
+size_t Tensor::numel() const {
+    return std::accumulate(this->shape().begin(), this->shape().end(),
+                           size_t(1), std::multiplies<size_t>());
+}
+
 ptrdiff_t Tensor::dataOffset() const { return _offset; }
```

```diff
@@ -154,16 +158,26 @@ std::shared_ptr<Tensor> Tensor::weight(void *data, infiniDtype_t dtype,
     tensor->_storage = Storage::create(size);
     tensor->_desc = TensorDesc::create(dtype, shape, strides);
+    if (data != nullptr) {
+        tensor->load(data);
+    }
+    tensor->_offset = 0;
+    return tensor;
+}
+
+void Tensor::load(const void *data, infinirtStream_t stream) {
+    if (stream) {
+        RUN_INFINI(infinirtMemcpyAsync(this->_storage->memory(), data,
+                                       this->_storage->size(), INFINIRT_MEMCPY_H2D, stream));
+        return;
+    }
     // NOTE: workaround for some platforms (MetaX) where multiple threads
     // running memcpy concurrently on the same host data can hang
     static std::mutex mutex;
     {
         std::lock_guard<std::mutex> lock(mutex);
-        RUN_INFINI(infinirtMemcpy(tensor->_storage->memory(), data, size,
-                                  INFINIRT_MEMCPY_H2D));
+        RUN_INFINI(infinirtMemcpy(this->_storage->memory(), data,
+                                  this->_storage->size(), INFINIRT_MEMCPY_H2D));
     }
-    tensor->_offset = 0;
-    return tensor;
 }
 
 std::shared_ptr<Tensor> Tensor::memShare(const std::vector<size_t> &shape,
                                          infiniDtype_t dtype_) const {
```
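The refactor splits host-to-device upload out of `Tensor::weight` into a reusable `Tensor::load`, with an asynchronous path when a stream is supplied and a mutex-serialized synchronous path otherwise; `Tensor::weight(nullptr, ...)` now just allocates, which is what lets the AWQ weights loader register empty tensors and fill them later. A hypothetical caller (`rows`, `cols`, `host_ptr`, and `stream` are made-up names for illustration):

```cpp
// Hypothetical usage of the new API: allocate an empty weight, then upload.
auto t = Tensor::weight(nullptr, INFINI_DTYPE_F16, {rows, cols});
t->load(host_ptr);          // synchronous, serialized by the internal mutex
t->load(host_ptr, stream);  // asynchronous H2D copy enqueued on `stream`
```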
xmake.lua (view file @ 22804eaa)
```diff
@@ -16,6 +16,8 @@ target("infinicore_infer")
     add_files("src/models/*/*.cpp")
     add_files("src/tensor/*.cpp")
     add_files("src/allocator/*.cpp")
+    add_files("src/dataloader/*.cpp")
+    add_files("src/cache_manager/*.cpp")
     add_includedirs("include")
     set_installdir(INFINI_ROOT)
```