jerrrrry / infinilm / Commits / cfe4b1a8

Unverified commit cfe4b1a8, authored Mar 16, 2026 by thatPepe, committed by GitHub on Mar 16, 2026

Merge pull request #267 from InfiniTensor/issue/263_T2-1-4
[2025 Autumn Competition] T2-1-4 qwen3vl

Parents: 66bfd282 b1f6af34

Showing 14 changed files with 3133 additions and 2 deletions (+3133, -2)
include/infinicore_infer.h                 +4    -0
include/infinicore_infer/models/qwen3vl.h  +203  -0
scripts/libinfinicore_infer/__init__.py    +21   -0
scripts/libinfinicore_infer/qwen3vl.py     +329  -0
scripts/qwen3vl.py                         +935  -0
src/allocator.hpp                          +1    -1
src/cache_manager/opcache_manager.hpp      +6    -0
src/models/inference_context.cpp           +75   -0
src/models/inference_context.hpp           +23   -0
src/models/qwen3vl/qwen3vl.cpp             +715  -0
src/models/qwen3vl/qwen3vl_cache.cpp       +43   -0
src/models/qwen3vl/qwen3vl_impl.hpp        +141  -0
src/models/qwen3vl/qwen3vl_weight.cpp      +636  -0
src/tensor/tensor.cpp                      +1    -1
include/infinicore_infer.h

...
@@ -4,7 +4,11 @@
#include "infinicore_infer/cache.h"
#include "infinicore_infer/weights_loader.h"
#include "infinicore_infer/models/deepseek.h"
#include "infinicore_infer/models/jiuge.h"
#include "infinicore_infer/models/jiuge_awq.h"
#include "infinicore_infer/models/qwen3vl.h"

#endif /* INFINICORE_INFER_H */
include/infinicore_infer/models/qwen3vl.h
0 → 100644

#ifndef QWEN3VL_WEIGHTS_H
#define QWEN3VL_WEIGHTS_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stddef.h>
#include <stdint.h>

struct Qwen3vlWeights;

// Function pointer signatures
typedef void (*qwen3vl_load_global_fn)(Qwen3vlWeights *, void *cpu_ptr);
typedef void (*qwen3vl_load_layer_fn)(Qwen3vlWeights *, void *cpu_ptr, size_t layer_id);

// Struct containing all weight loading functions
typedef struct {
    // Global
    qwen3vl_load_global_fn load_input_embd;
    qwen3vl_load_global_fn load_output_norm;
    qwen3vl_load_global_fn load_output_embd;
    // Attention
    qwen3vl_load_layer_fn load_attn_norm;
    qwen3vl_load_layer_fn load_attn_q_norm;
    qwen3vl_load_layer_fn load_attn_k_norm;
    qwen3vl_load_layer_fn load_attn_qkv_proj;
    qwen3vl_load_layer_fn load_attn_o_proj;
    // MLP
    qwen3vl_load_layer_fn load_mlp_norm;
    qwen3vl_load_layer_fn load_mlp_gate_up;
    qwen3vl_load_layer_fn load_mlp_down;
} Qwen3vlLangWeightLoader;

typedef struct {
    // Patch_embed
    qwen3vl_load_global_fn load_patch_embed_weight;
    qwen3vl_load_global_fn load_patch_embed_bias;
    qwen3vl_load_global_fn load_pos_embed_weight;
    // blocks attn
    qwen3vl_load_layer_fn load_attn_proj_weight;
    qwen3vl_load_layer_fn load_attn_proj_bias;
    qwen3vl_load_layer_fn load_attn_qkv_weight;
    qwen3vl_load_layer_fn load_attn_qkv_bias;
    // block mlp
    qwen3vl_load_layer_fn load_mlp_linear_fc1_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc1_bias;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_bias;
    // block norm
    qwen3vl_load_layer_fn load_norm1_weight;
    qwen3vl_load_layer_fn load_norm1_bias;
    qwen3vl_load_layer_fn load_norm2_weight;
    qwen3vl_load_layer_fn load_norm2_bias;
    // deepstack_merger
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_bias;
    // merger
    qwen3vl_load_global_fn load_merger_linear_fc1_weight;
    qwen3vl_load_global_fn load_merger_linear_fc1_bias;
    qwen3vl_load_global_fn load_merger_linear_fc2_weight;
    qwen3vl_load_global_fn load_merger_linear_fc2_bias;
    qwen3vl_load_global_fn load_merger_norm_weight;
    qwen3vl_load_global_fn load_merger_norm_bias;
} Qwen3vlVisWeightLoader;

typedef struct {
    Qwen3vlLangWeightLoader lang_loader;
    Qwen3vlVisWeightLoader vis_loader;
} Qwen3vlWeightLoader;

struct Qwen3vlModel;

typedef struct {
    size_t bos_token_id;
    size_t eos_token_id;
    size_t head_dim;
    size_t hidden_size;
    float initializer_range;
    size_t intermediate_size;
    size_t max_tokens;
    size_t num_attention_heads;
    size_t num_hidden_layers;
    size_t num_key_value_heads;
    float rms_norm_eps;
    size_t mrope_section[3];
    size_t rope_theta;
    size_t vocab_size;
} Qwen3vlTextMeta;

typedef struct {
    size_t depth;
    size_t deepstack_visual_indexes[3];
    size_t hidden_size;
    size_t in_channels;
    float initializer_range;
    size_t intermediate_size;
    size_t num_heads;
    size_t num_position_embeddings;
    size_t out_hidden_size;
    size_t patch_size;
    size_t spatial_merge_size;
    size_t temporal_patch_size;
} Qwen3vlVisMeta;

typedef struct {
    infiniDtype_t dtype; // INFINI_DTYPE_BF16
    Qwen3vlTextMeta text_meta;
    Qwen3vlVisMeta vis_meta;
    size_t image_token_id;
    size_t video_token_id;
    size_t vision_end_token_id;
    size_t vision_start_token_id;
} Qwen3vlMeta;
//////////////////// APIs ///////////////////////
/// @brief Create the model
/// @param device type of coprocessor
/// @param ndev number of coprocessors
/// @param dev_ids coprocessor ids, length ndev
__INFINI_C __export struct Qwen3vlModel *
createQwen3vlModel(const Qwen3vlMeta *, const Qwen3vlWeights *);

__INFINI_C Qwen3vlWeights *
createQwen3vlWeights(const Qwen3vlMeta *meta,
                     infiniDevice_t device,
                     int ndev,
                     const int *dev_ids,
                     bool transpose_weight);

__INFINI_C __export Qwen3vlWeightLoader *createQwen3vlWeightLoader();

/// @brief Destroy the model
__INFINI_C __export void destroyQwen3vlModel(struct Qwen3vlModel *);

__INFINI_C __export struct Qwen3vlCache *createQwen3vlCache(const struct Qwen3vlModel *);

__INFINI_C __export void dropQwen3vlCache(const struct Qwen3vlModel *, struct Qwen3vlCache *);

/// @brief Run one round of batched inference and sample new tokens
/// @param tokens input token buffer
/// @param ntok number of input tokens
/// @param nreq number of requests
/// @param req_lens number of tokens per request
/// @param req_pos start position of each request
/// @param kv_caches KV cache of each request
/// @param temperature sampling temperature (0. means greedy sampling)
/// @param topk sampling top-k (1 means greedy sampling)
/// @param topp sampling top-p
/// @param output output token array, one output per request, length at least nreq
__INFINI_C __export void
inferBatchQwen3vl(struct Qwen3vlModel *,
                  const uint32_t *tokens, uint32_t ntok,
                  void *pixel_values, uint32_t total_patches,
                  uint32_t *image_grid_thw, uint32_t num_images,
                  void *pixel_values_videos, uint32_t total_patches_videos,
                  uint32_t *video_grid_thw, uint32_t num_videos,
                  uint32_t patch_features,
                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                  struct Qwen3vlCache **caches,
                  const float *temperature, const uint32_t *topk, const float *topp,
                  uint32_t *output);

/// @brief Run one round of batched inference and output the logits after the output embedding
/// @param tokens input token buffer
/// @param ntok number of input tokens
/// @param nreq number of requests
/// @param req_lens number of tokens per request
/// @param req_pos start position of each request
/// @param kv_caches KV cache of each request
/// @param logits output logits buffer, one row per request, length at least nreq
__INFINI_C __export void
forwardBatchQwen3vl(struct Qwen3vlModel *,
                    const uint32_t *tokens, uint32_t ntok,
                    void *pixel_values, uint32_t total_patches,
                    uint32_t *image_grid_thw, uint32_t num_images,
                    void *pixel_values_videos, uint32_t total_patches_videos,
                    uint32_t *video_grid_thw, uint32_t num_videos,
                    uint32_t patch_features,
                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                    struct Qwen3vlCache **caches,
                    void *logits);

#endif // QWEN3VL_WEIGHTS_H
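Editor's note: a minimal, hypothetical sketch of driving this C API from Python for a single text-only request, assuming `lib` has been configured by `Qwen3vlModel.register_lib` (see scripts/libinfinicore_infer/qwen3vl.py below) and that `model` and `cache` were already obtained from createQwen3vlModel / createQwen3vlCache. The visual arguments are passed as null pointers and zeros, which is how the Python driver in this commit also encodes "no image, no video"; the token ids are placeholders.

```python
from ctypes import c_uint, c_float, POINTER

tokens = (c_uint * 3)(1, 2, 3)            # 3 prompt tokens (hypothetical ids)
req_lens = (c_uint * 1)(3)                # one request holding 3 tokens
req_pos = (c_uint * 1)(0)                 # starting at position 0 of its KV cache
caches = (POINTER(Qwen3vlCacheCStruct) * 1)(cache)
temperature = (c_float * 1)(0.0)          # 0. -> greedy sampling
topk = (c_uint * 1)(1)                    # 1  -> greedy sampling
topp = (c_float * 1)(1.0)
output = (c_uint * 1)()

lib.inferBatchQwen3vl(
    model, tokens, 3,
    None, 0, None, 0,                     # pixel_values, total_patches, image_grid_thw, num_images
    None, 0, None, 0,                     # video counterparts
    0,                                    # patch_features
    req_lens, 1, req_pos, caches,
    temperature, topk, topp, output)
print(output[0])                          # sampled next token for the request
```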
scripts/libinfinicore_infer/__init__.py

...
@@ -8,6 +8,17 @@ from .deepseek_v3 import (
    DeepSeekV3WeightLoaderCStruct,
    DeepSeekV3CacheCStruct,
)
from .qwen3vl import (
    Qwen3vlModel,
    Qwen3vlMetaCStruct,
    TextMetaCStruct,
    VisMetaCStruct,
    Qwen3vlWeightsCStruct,
    Qwen3vlWeightLoaderCStruct,
    Qwen3vlVisWeightLoaderCStruct,
    Qwen3vlLangWeightLoaderCStruct,
    Qwen3vlCacheCStruct,
)

__all__ = [
    "DataType",
...
@@ -23,5 +34,15 @@ __all__ = [
    "DeepSeekV3MetaCStruct",
    "DeepSeekV3WeightsCStruct",
    "DeepSeekV3WeightLoaderCStruct",
    "DeepSeekV3CacheCStruct",
    "Qwen3vlModel",
    "Qwen3vlMetaCStruct",
    "TextMetaCStruct",
    "VisMetaCStruct",
    "Qwen3vlWeightsCStruct",
    "Qwen3vlWeightLoaderCStruct",
    "Qwen3vlVisWeightLoaderCStruct",
    "Qwen3vlLangWeightLoaderCStruct",
    "Qwen3vlCacheCStruct",
    "ModelRegister",
]
scripts/libinfinicore_infer/qwen3vl.py
0 → 100644

from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model
from ctypes import (
    c_size_t,
    c_uint,
    c_uint16,
    c_int,
    c_float,
    c_void_p,
    c_bool,
    POINTER,
    Structure,
    CFUNCTYPE,
)


class TextMetaCStruct(Structure):
    _fields_ = [
        ("bos_token_id", c_size_t),
        ("eos_token_id", c_size_t),
        ("head_dim", c_size_t),
        ("hidden_size", c_size_t),
        ("initializer_range", c_float),
        ("_pad1", c_float),
        ("intermediate_size", c_size_t),
        ("max_tokens", c_size_t),
        ("num_attention_heads", c_size_t),
        ("num_hidden_layers", c_size_t),
        ("num_key_value_heads", c_size_t),
        ("rms_norm_eps", c_float),
        ("_pad2", c_float),
        ("mrope_section", c_size_t * 3),
        ("rope_theta", c_size_t),
        ("vocab_size", c_size_t),
    ]


class VisMetaCStruct(Structure):
    _fields_ = [
        ("depth", c_size_t),
        ("deepstack_visual_indexes", c_size_t * 3),
        ("hidden_size", c_size_t),
        ("in_channels", c_size_t),
        ("initializer_range", c_float),
        ("_pad1", c_float),
        ("intermediate_size", c_size_t),
        ("num_heads", c_size_t),
        ("num_position_embeddings", c_size_t),
        ("out_hidden_size", c_size_t),
        ("patch_size", c_size_t),
        ("spatial_merge_size", c_size_t),
        ("temporal_patch_size", c_size_t),
    ]


class Qwen3vlMetaCStruct(Structure):
    _fields_ = [
        ("dtype", DataType),
        ("_pad_dtype", c_uint),
        ("text_meta", TextMetaCStruct),
        ("vis_meta", VisMetaCStruct),
        # Token ids
        ("image_token_id", c_size_t),
        ("video_token_id", c_size_t),
        ("vision_end_token_id", c_size_t),
        ("vision_start_token_id", c_size_t),
    ]


class Qwen3vlWeightsCStruct(Structure):
    pass


class Qwen3vlModelCStruct(Structure):
    pass


class Qwen3vlCacheCStruct(Structure):
    pass


load_global_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p)
load_layer_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p, c_size_t)


class Qwen3vlLangWeightLoaderCStruct(Structure):
    _fields_ = [
        # Global
        ("load_input_embd", load_global_fn),
        ("load_output_norm", load_global_fn),
        ("load_output_embd", load_global_fn),
        # Attention
        ("load_attn_norm", load_layer_fn),
        ("load_attn_q_norm", load_layer_fn),
        ("load_attn_k_norm", load_layer_fn),
        ("load_attn_qkv_proj", load_layer_fn),
        ("load_attn_o_proj", load_layer_fn),
        # MLP
        ("load_mlp_norm", load_layer_fn),
        ("load_mlp_gate_up", load_layer_fn),
        ("load_mlp_down", load_layer_fn),
    ]


class Qwen3vlVisWeightLoaderCStruct(Structure):
    _fields_ = [
        # Patch embed
        ("load_patch_embed_weight", load_global_fn),
        ("load_patch_embed_bias", load_global_fn),
        ("load_pos_embed_weight", load_global_fn),
        # Blocks attention
        ("load_attn_proj_weight", load_layer_fn),
        ("load_attn_proj_bias", load_layer_fn),
        ("load_attn_qkv_weight", load_layer_fn),
        ("load_attn_qkv_bias", load_layer_fn),
        # Blocks MLP
        ("load_mlp_linear_fc1_weight", load_layer_fn),
        ("load_mlp_linear_fc1_bias", load_layer_fn),
        ("load_mlp_linear_fc2_weight", load_layer_fn),
        ("load_mlp_linear_fc2_bias", load_layer_fn),
        # Blocks norm
        ("load_norm1_weight", load_layer_fn),
        ("load_norm1_bias", load_layer_fn),
        ("load_norm2_weight", load_layer_fn),
        ("load_norm2_bias", load_layer_fn),
        # Deepstack merger
        ("load_deepstack_merger_linear_fc1_weight", load_layer_fn),
        ("load_deepstack_merger_linear_fc1_bias", load_layer_fn),
        ("load_deepstack_merger_linear_fc2_weight", load_layer_fn),
        ("load_deepstack_merger_linear_fc2_bias", load_layer_fn),
        ("load_deepstack_merger_norm_weight", load_layer_fn),
        ("load_deepstack_merger_norm_bias", load_layer_fn),
        # Merger
        ("load_merger_linear_fc1_weight", load_global_fn),
        ("load_merger_linear_fc1_bias", load_global_fn),
        ("load_merger_linear_fc2_weight", load_global_fn),
        ("load_merger_linear_fc2_bias", load_global_fn),
        ("load_merger_norm_weight", load_global_fn),
        ("load_merger_norm_bias", load_global_fn),
    ]


class Qwen3vlWeightLoaderCStruct(Structure):
    _fields_ = [
        ("lang_loader", Qwen3vlLangWeightLoaderCStruct),
        ("vis_loader", Qwen3vlVisWeightLoaderCStruct),
    ]


@register_model
class Qwen3vlModel(BaseModel):
    @classmethod
    def register_lib(cls, lib):
        """Register Qwen3vl model functions with the library"""
        lib.createQwen3vlWeightLoader.argtypes = []
        lib.createQwen3vlWeightLoader.restype = POINTER(Qwen3vlWeightLoaderCStruct)
        lib.createQwen3vlWeights.argtypes = [
            POINTER(Qwen3vlMetaCStruct),
            DeviceType,
            c_int,
            POINTER(c_int),
            c_bool,
        ]
        lib.createQwen3vlWeights.restype = POINTER(Qwen3vlWeightsCStruct)
        lib.createQwen3vlModel.argtypes = [
            POINTER(Qwen3vlMetaCStruct),
            POINTER(Qwen3vlWeightsCStruct),
        ]
        lib.createQwen3vlModel.restype = POINTER(Qwen3vlModelCStruct)
        lib.destroyQwen3vlModel.argtypes = [POINTER(Qwen3vlModelCStruct)]
        lib.createQwen3vlCache.argtypes = [POINTER(Qwen3vlModelCStruct)]
        lib.createQwen3vlCache.restype = POINTER(Qwen3vlCacheCStruct)
        lib.dropQwen3vlCache.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(Qwen3vlCacheCStruct),
        ]
        lib.inferBatchQwen3vl.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(c_uint),
            c_uint,
            c_void_p,          # pixel_values
            c_uint,            # total_patches
            POINTER(c_uint),   # image_grid_thw
            c_uint,            # num_images
            c_void_p,          # pixel_values_videos
            c_uint,            # total_patches_videos
            POINTER(c_uint),   # video_grid_thw
            c_uint,            # num_videos
            c_uint,            # patch_features
            POINTER(c_uint),
            c_uint,
            POINTER(c_uint),
            POINTER(POINTER(Qwen3vlCacheCStruct)),
            POINTER(c_float),
            POINTER(c_uint),
            POINTER(c_float),
            POINTER(c_uint),
        ]
        lib.forwardBatchQwen3vl.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(c_uint),
            c_uint,
            c_void_p,          # pixel_values
            c_uint,            # total_patches
            POINTER(c_uint),   # image_grid_thw
            c_uint,            # num_images
            c_void_p,          # pixel_values_videos
            c_uint,            # total_patches_videos
            POINTER(c_uint),   # video_grid_thw
            c_uint,            # num_videos
            c_uint,            # patch_features
            POINTER(c_uint),
            c_uint,
            POINTER(c_uint),
            POINTER(POINTER(Qwen3vlCacheCStruct)),
            c_void_p,
        ]

    def create_weight_loader(self):
        return self.lib.createQwen3vlWeightLoader()

    def create_weights(self, meta, device_type, ndev, dev_ids, transpose_weight):
        return self.lib.createQwen3vlWeights(meta, device_type, ndev, dev_ids, transpose_weight)

    def create_model(self, meta, weights):
        return self.lib.createQwen3vlModel(meta, weights)

    def destroy_model(self, model):
        self.lib.destroyQwen3vlModel(model)

    def create_cache(self, model):
        return self.lib.createQwen3vlCache(model)

    def drop_cache(self, model, cache):
        self.lib.dropQwen3vlCache(model, cache)

    def infer_batch(self, model, tokens, ntok, pixel_values, total_patches, image_grid_thw,
                    num_images, pixel_values_videos, total_patches_videos, video_grid_thw,
                    num_videos, patch_features, req_lens, nreq, req_pos, caches,
                    temperature, topk, topp, output):
        self.lib.inferBatchQwen3vl(model, tokens, ntok, pixel_values, total_patches,
                                   image_grid_thw, num_images, pixel_values_videos,
                                   total_patches_videos, video_grid_thw, num_videos,
                                   patch_features, req_lens, nreq, req_pos, caches,
                                   temperature, topk, topp, output)

    def forward_batch(self, model, tokens, ntok, pixel_values, total_patches, image_grid_thw,
                      num_images, pixel_values_videos, total_patches_videos, video_grid_thw,
                      num_videos, patch_features, req_lens, nreq, req_pos, caches, logits):
        self.lib.forwardBatchQwen3vl(model, tokens, ntok, pixel_values, total_patches,
                                     image_grid_thw, num_images, pixel_values_videos,
                                     total_patches_videos, video_grid_thw, num_videos,
                                     patch_features, req_lens, nreq, req_pos, caches, logits)
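Editor's note: the explicit `_pad1` / `_pad2` / `_pad_dtype` fields above mirror the padding a C compiler inserts in the corresponding structs in qwen3vl.h (a 4-byte float followed by an 8-byte size_t leaves a 4-byte hole). The sketch below is not part of the PR; it only illustrates that ctypes applies native alignment anyway, so spelling the hole out keeps the offsets identical while making the layout self-documenting.

```python
import ctypes
from ctypes import c_size_t, c_float, Structure

class Implicit(Structure):        # relies on ctypes' native alignment rules
    _fields_ = [("f", c_float), ("n", c_size_t)]

class Explicit(Structure):        # spells the hole out, as the PR's structs do
    _fields_ = [("f", c_float), ("_pad", c_float), ("n", c_size_t)]

# On LP64 platforms both layouts place "n" at offset 8 and occupy 16 bytes.
assert ctypes.sizeof(Implicit) == ctypes.sizeof(Explicit) == 16
assert Implicit.n.offset == Explicit.n.offset == 8
```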
scripts/qwen3vl.py
0 → 100644

import ctypes
from typing import List, Sequence
from tqdm import tqdm
from libinfinicore_infer import (
    Qwen3vlModel,
    Qwen3vlMetaCStruct,
    TextMetaCStruct,
    VisMetaCStruct,
    Qwen3vlWeightsCStruct,
    Qwen3vlCacheCStruct,
    DataType,
    DeviceType,
)
from infer_task import InferTask, KVCache
from ctypes import POINTER, c_float, c_int, c_uint, c_uint16, c_void_p, byref, c_bool
import os
from pathlib import Path
import safetensors
import sys
import time
import json
import math
import torch
import transformers

torch.set_default_device("cpu")


class Qwen3vlLangWeightsNaming:
    def input_embd(self):
        return "model.language_model.embed_tokens.weight"

    def output_embd(self):
        return "model.language_model.embed_tokens.weight"

    def output_norm(self):
        return "model.language_model.norm.weight"

    def attn_norm(self, i):
        return f"model.language_model.layers.{i}.input_layernorm.weight"

    def attn_q_proj(self, i):
        return f"model.language_model.layers.{i}.self_attn.q_proj.weight"

    def attn_q_norm(self, i):
        return f"model.language_model.layers.{i}.self_attn.q_norm.weight"

    def attn_k_proj(self, i):
        return f"model.language_model.layers.{i}.self_attn.k_proj.weight"

    def attn_k_norm(self, i):
        return f"model.language_model.layers.{i}.self_attn.k_norm.weight"

    def attn_o_proj(self, i):
        return f"model.language_model.layers.{i}.self_attn.o_proj.weight"

    def attn_v_proj(self, i):
        return f"model.language_model.layers.{i}.self_attn.v_proj.weight"

    def mlp_norm(self, i):
        return f"model.language_model.layers.{i}.post_attention_layernorm.weight"

    def mlp_gate(self, i):
        return f"model.language_model.layers.{i}.mlp.gate_proj.weight"

    def mlp_down(self, i):
        return f"model.language_model.layers.{i}.mlp.down_proj.weight"

    def mlp_up(self, i):
        return f"model.language_model.layers.{i}.mlp.up_proj.weight"


class Qwen3vlVisWeightsNaming:
    def patch_embed_weight(self):
        return "model.visual.patch_embed.proj.weight"

    def patch_embed_bias(self):
        return "model.visual.patch_embed.proj.bias"

    def pos_embed_weight(self):
        return "model.visual.pos_embed.weight"

    def attn_proj_weight(self, i):
        return f"model.visual.blocks.{i}.attn.proj.weight"

    def attn_proj_bias(self, i):
        return f"model.visual.blocks.{i}.attn.proj.bias"

    def attn_qkv_weight(self, i):
        return f"model.visual.blocks.{i}.attn.qkv.weight"

    def attn_qkv_bias(self, i):
        return f"model.visual.blocks.{i}.attn.qkv.bias"

    def mlp_linear_fc1_weight(self, i):
        return f"model.visual.blocks.{i}.mlp.linear_fc1.weight"

    def mlp_linear_fc1_bias(self, i):
        return f"model.visual.blocks.{i}.mlp.linear_fc1.bias"

    def mlp_linear_fc2_weight(self, i):
        return f"model.visual.blocks.{i}.mlp.linear_fc2.weight"

    def mlp_linear_fc2_bias(self, i):
        return f"model.visual.blocks.{i}.mlp.linear_fc2.bias"

    def norm1_weight(self, i):
        return f"model.visual.blocks.{i}.norm1.weight"

    def norm1_bias(self, i):
        return f"model.visual.blocks.{i}.norm1.bias"

    def norm2_weight(self, i):
        return f"model.visual.blocks.{i}.norm2.weight"

    def norm2_bias(self, i):
        return f"model.visual.blocks.{i}.norm2.bias"

    def deepstack_merger_linear_fc1_weight(self, i):
        return f"model.visual.deepstack_merger_list.{i}.linear_fc1.weight"

    def deepstack_merger_linear_fc1_bias(self, i):
        return f"model.visual.deepstack_merger_list.{i}.linear_fc1.bias"

    def deepstack_merger_linear_fc2_weight(self, i):
        return f"model.visual.deepstack_merger_list.{i}.linear_fc2.weight"

    def deepstack_merger_linear_fc2_bias(self, i):
        return f"model.visual.deepstack_merger_list.{i}.linear_fc2.bias"

    def deepstack_merger_norm_weight(self, i):
        return f"model.visual.deepstack_merger_list.{i}.norm.weight"

    def deepstack_merger_norm_bias(self, i):
        return f"model.visual.deepstack_merger_list.{i}.norm.bias"

    def merger_linear_fc1_weight(self):
        return "model.visual.merger.linear_fc1.weight"

    def merger_linear_fc1_bias(self):
        return "model.visual.merger.linear_fc1.bias"

    def merger_linear_fc2_weight(self):
        return "model.visual.merger.linear_fc2.weight"

    def merger_linear_fc2_bias(self):
        return "model.visual.merger.linear_fc2.bias"

    def merger_norm_weight(self):
        return "model.visual.merger.norm.weight"

    def merger_norm_bias(self):
        return "model.visual.merger.norm.bias"


class Qwen3vlMeta(Qwen3vlMetaCStruct):
    def __init__(self, config, max_tokens=None):
        if config["text_config"]["dtype"] == "float16":
            dt_ = DataType.INFINI_DTYPE_F16
            self.torch_dtype = torch.float16
        elif config["text_config"]["dtype"] == "float32":
            dt_ = DataType.INFINI_DTYPE_F32
            self.torch_dtype = torch.float32
        elif config["text_config"]["dtype"] == "bfloat16":
            dt_ = DataType.INFINI_DTYPE_BF16
            self.torch_dtype = torch.bfloat16
        else:
            raise ValueError(f"Unsupported text dtype: {config['text_config']['dtype']}")
        super().__init__(
            dtype=dt_,
            image_token_id=config["image_token_id"],
            video_token_id=config["video_token_id"],
            vision_end_token_id=config["vision_end_token_id"],
            vision_start_token_id=config["vision_start_token_id"],
            text_meta=TextMetaCStruct(
                bos_token_id=config["text_config"]["bos_token_id"],
                eos_token_id=config["text_config"]["eos_token_id"],
                head_dim=config["text_config"]["head_dim"],
                hidden_size=config["text_config"]["hidden_size"],
                initializer_range=config["text_config"]["initializer_range"],
                intermediate_size=config["text_config"]["intermediate_size"],
                max_tokens=(
                    config["text_config"]["max_position_embeddings"]
                    if max_tokens is None
                    else max_tokens
                ),
                num_attention_heads=config["text_config"]["num_attention_heads"],
                num_hidden_layers=config["text_config"]["num_hidden_layers"],
                num_key_value_heads=config["text_config"]["num_key_value_heads"],
                rms_norm_eps=config["text_config"]["rms_norm_eps"],
                mrope_section=(ctypes.c_ulong * 3)(
                    *config["text_config"]["rope_scaling"]["mrope_section"]
                ),
                rope_theta=config["text_config"]["rope_theta"],
                vocab_size=config["text_config"]["vocab_size"],
            ),
            vis_meta=VisMetaCStruct(
                depth=config["vision_config"]["depth"],
                deepstack_visual_indexes=(ctypes.c_ulong * 3)(
                    *config["vision_config"]["deepstack_visual_indexes"]
                ),
                hidden_size=config["vision_config"]["hidden_size"],
                in_channels=config["vision_config"]["in_channels"],
                initializer_range=config["vision_config"]["initializer_range"],
                intermediate_size=config["vision_config"]["intermediate_size"],
                num_heads=config["vision_config"]["num_heads"],
                num_position_embeddings=config["vision_config"]["num_position_embeddings"],
                out_hidden_size=config["vision_config"]["out_hidden_size"],
                patch_size=config["vision_config"]["patch_size"],
                spatial_merge_size=config["vision_config"]["spatial_merge_size"],
                temporal_patch_size=config["vision_config"]["temporal_patch_size"],
            ),
        )


def load_specific_tensor(model_dir, tensor_name):
    """
    Load a specific tensor from a safetensors model.
    Supports both sharded models (with index.json) and single file models.
    """
    # Try to load from individual .safetensors files
    safetensors_files = [f for f in os.listdir(model_dir) if f.endswith(".safetensors")]
    if not safetensors_files:
        raise FileNotFoundError(f"No .safetensors files found in {model_dir}")
    # Try to find the tensor in each file
    for filename in safetensors_files:
        tensor_file = os.path.join(model_dir, filename)
        try:
            with safetensors.safe_open(tensor_file, framework="pt", device="cpu") as f:
                if tensor_name in f.keys():
                    tensor = f.get_tensor(tensor_name)
                    return tensor
        except Exception:
            continue
    # If we reach here, tensor was not found in any file
    raise KeyError(f"{tensor_name} not found in any .safetensors files")
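Editor's note: not part of the PR. When the checkpoint ships the standard sharded-checkpoint index file `model.safetensors.index.json`, the scan over every `.safetensors` file above can be skipped by reading the tensor-to-shard map directly; a minimal sketch under that assumption:

```python
import json
import os
import safetensors

def load_specific_tensor_indexed(model_dir, tensor_name):
    """Look up the shard that holds tensor_name via the safetensors index, if present."""
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if not os.path.exists(index_path):
        # Fall back to the exhaustive scan defined above
        return load_specific_tensor(model_dir, tensor_name)
    with open(index_path) as f:
        shard = json.load(f)["weight_map"][tensor_name]
    with safetensors.safe_open(os.path.join(model_dir, shard), framework="pt", device="cpu") as f:
        return f.get_tensor(tensor_name)
```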
def load_Qwen3vl_weights(meta: Qwen3vlMeta, weights, model_path: str, ndev: int):
    # torch load weights, and reshape for qkv_proj / mlp_gate_up stack, attn / mlp parallel
    # weight loader function load from specific offset according to idev, and transpose
    model_instance = Qwen3vlModel()
    weight_loader = model_instance.create_weight_loader()
    vis_names = Qwen3vlVisWeightsNaming()
    lang_names = Qwen3vlLangWeightsNaming()
    nkvh = meta.text_meta.num_key_value_heads
    nh = meta.text_meta.num_attention_heads
    dh = meta.text_meta.head_dim
    d = meta.text_meta.hidden_size
    di = meta.text_meta.intermediate_size
    assert nh % nkvh == 0
    assert nh % ndev == 0
    assert nkvh % ndev == 0
    assert di % ndev == 0

    # -------------------------------
    # Language_model weights
    # -------------------------------
    input_embd = load_specific_tensor(model_path, lang_names.input_embd()).to(meta.torch_dtype)
    weight_loader.contents.lang_loader.load_input_embd(weights, input_embd.data_ptr())
    del input_embd

    output_norm = load_specific_tensor(model_path, lang_names.output_norm()).to(meta.torch_dtype)
    weight_loader.contents.lang_loader.load_output_norm(weights, output_norm.data_ptr())
    del output_norm

    output_embd = load_specific_tensor(model_path, lang_names.output_embd()).to(meta.torch_dtype)
    weight_loader.contents.lang_loader.load_output_embd(weights, output_embd.data_ptr())
    del output_embd

    for i in range(meta.text_meta.num_hidden_layers):
        attn_norm = load_specific_tensor(model_path, lang_names.attn_norm(i)).to(meta.torch_dtype)
        weight_loader.contents.lang_loader.load_attn_norm(weights, attn_norm.data_ptr(), i)
        del attn_norm

        attn_q_proj = load_specific_tensor(model_path, lang_names.attn_q_proj(i))
        attn_k_proj = load_specific_tensor(model_path, lang_names.attn_k_proj(i))
        attn_v_proj = load_specific_tensor(model_path, lang_names.attn_v_proj(i))
        _Q = attn_q_proj.reshape(nh, dh, d)
        _K = attn_k_proj.reshape(nkvh, dh, d)
        _V = attn_v_proj.reshape(nkvh, dh, d)
        qkv_proj = []
        _nh = nh // ndev
        _nkvh = nkvh // ndev
        for _idev in range(ndev):
            qkv_proj.append(_Q[_idev * _nh : (_idev + 1) * _nh, :, :])
            qkv_proj.append(_K[_idev * _nkvh : (_idev + 1) * _nkvh, :, :])
            qkv_proj.append(_V[_idev * _nkvh : (_idev + 1) * _nkvh, :, :])
        attn_qkv_proj = torch.cat(qkv_proj, dim=0).to(meta.torch_dtype).contiguous()
        weight_loader.contents.lang_loader.load_attn_qkv_proj(weights, attn_qkv_proj.data_ptr(), i)
        del attn_qkv_proj

        attn_q_norm = load_specific_tensor(model_path, lang_names.attn_q_norm(i)).to(meta.torch_dtype)
        weight_loader.contents.lang_loader.load_attn_q_norm(weights, attn_q_norm.data_ptr(), i)
        del attn_q_norm

        attn_k_norm = load_specific_tensor(model_path, lang_names.attn_k_norm(i)).to(meta.torch_dtype)
        weight_loader.contents.lang_loader.load_attn_k_norm(weights, attn_k_norm.data_ptr(), i)
        del attn_k_norm

        attn_o_proj = load_specific_tensor(model_path, lang_names.attn_o_proj(i))
        attn_o_proj = (
            attn_o_proj.to(meta.torch_dtype)
            .reshape([d, ndev, nh // ndev * dh])
            .transpose(0, 1)
            .contiguous()
        )
        weight_loader.contents.lang_loader.load_attn_o_proj(weights, attn_o_proj.data_ptr(), i)
        del attn_o_proj

        mlp_norm = load_specific_tensor(model_path, lang_names.mlp_norm(i)).to(meta.torch_dtype)
        weight_loader.contents.lang_loader.load_mlp_norm(weights, mlp_norm.data_ptr(), i)
        del mlp_norm

        mlp_gate = load_specific_tensor(model_path, lang_names.mlp_gate(i))
        mlp_up = load_specific_tensor(model_path, lang_names.mlp_up(i))
        gate_up = []
        _di = di // ndev
        for _idev in range(ndev):
            _start = _idev * _di
            _end = (_idev + 1) * _di
            gate_up.append(mlp_gate[_start:_end, :])
            gate_up.append(mlp_up[_start:_end, :])
        mlp_gate_up = torch.cat(gate_up, dim=0).to(meta.torch_dtype).contiguous()
        weight_loader.contents.lang_loader.load_mlp_gate_up(weights, mlp_gate_up.data_ptr(), i)
        del mlp_gate_up

        mlp_down = load_specific_tensor(model_path, lang_names.mlp_down(i))
        mlp_down = (
            mlp_down.to(meta.torch_dtype)
            .reshape([d, ndev, di // ndev])
            .transpose(0, 1)
            .contiguous()
        )
        weight_loader.contents.lang_loader.load_mlp_down(weights, mlp_down.data_ptr(), i)
        del mlp_down

    # -------------------------------
    # Vision head weights
    # -------------------------------
    patch_embed_weight = load_specific_tensor(model_path, vis_names.patch_embed_weight()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_patch_embed_weight(weights, patch_embed_weight.data_ptr())
    del patch_embed_weight

    patch_embed_bias = load_specific_tensor(model_path, vis_names.patch_embed_bias()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_patch_embed_bias(weights, patch_embed_bias.data_ptr())
    del patch_embed_bias

    pos_embed_weight = load_specific_tensor(model_path, vis_names.pos_embed_weight()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_pos_embed_weight(weights, pos_embed_weight.data_ptr())
    del pos_embed_weight

    for i in range(meta.vis_meta.depth):
        attn_proj_weight = load_specific_tensor(model_path, vis_names.attn_proj_weight(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_attn_proj_weight(weights, attn_proj_weight.data_ptr(), i)
        del attn_proj_weight

        attn_proj_bias = load_specific_tensor(model_path, vis_names.attn_proj_bias(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_attn_proj_bias(weights, attn_proj_bias.data_ptr(), i)
        del attn_proj_bias

        attn_qkv_weight = load_specific_tensor(model_path, vis_names.attn_qkv_weight(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_attn_qkv_weight(weights, attn_qkv_weight.data_ptr(), i)
        del attn_qkv_weight

        attn_qkv_bias = load_specific_tensor(model_path, vis_names.attn_qkv_bias(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_attn_qkv_bias(weights, attn_qkv_bias.data_ptr(), i)
        del attn_qkv_bias

        mlp_linear_fc1_weight = load_specific_tensor(model_path, vis_names.mlp_linear_fc1_weight(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_mlp_linear_fc1_weight(weights, mlp_linear_fc1_weight.data_ptr(), i)
        del mlp_linear_fc1_weight

        mlp_linear_fc1_bias = load_specific_tensor(model_path, vis_names.mlp_linear_fc1_bias(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_mlp_linear_fc1_bias(weights, mlp_linear_fc1_bias.data_ptr(), i)
        del mlp_linear_fc1_bias

        mlp_linear_fc2_weight = load_specific_tensor(model_path, vis_names.mlp_linear_fc2_weight(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_mlp_linear_fc2_weight(weights, mlp_linear_fc2_weight.data_ptr(), i)
        del mlp_linear_fc2_weight

        mlp_linear_fc2_bias = load_specific_tensor(model_path, vis_names.mlp_linear_fc2_bias(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_mlp_linear_fc2_bias(weights, mlp_linear_fc2_bias.data_ptr(), i)
        del mlp_linear_fc2_bias

        norm1_weight = load_specific_tensor(model_path, vis_names.norm1_weight(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_norm1_weight(weights, norm1_weight.data_ptr(), i)
        del norm1_weight

        norm1_bias = load_specific_tensor(model_path, vis_names.norm1_bias(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_norm1_bias(weights, norm1_bias.data_ptr(), i)
        del norm1_bias

        norm2_weight = load_specific_tensor(model_path, vis_names.norm2_weight(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_norm2_weight(weights, norm2_weight.data_ptr(), i)
        del norm2_weight

        norm2_bias = load_specific_tensor(model_path, vis_names.norm2_bias(i)).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_norm2_bias(weights, norm2_bias.data_ptr(), i)
        del norm2_bias

    for i in range(len(meta.vis_meta.deepstack_visual_indexes)):
        deepstack_merger_linear_fc1_weight = load_specific_tensor(
            model_path, vis_names.deepstack_merger_linear_fc1_weight(i)
        ).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc1_weight(
            weights, deepstack_merger_linear_fc1_weight.data_ptr(), i
        )
        del deepstack_merger_linear_fc1_weight

        deepstack_merger_linear_fc1_bias = load_specific_tensor(
            model_path, vis_names.deepstack_merger_linear_fc1_bias(i)
        ).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc1_bias(
            weights, deepstack_merger_linear_fc1_bias.data_ptr(), i
        )
        del deepstack_merger_linear_fc1_bias

        deepstack_merger_linear_fc2_weight = load_specific_tensor(
            model_path, vis_names.deepstack_merger_linear_fc2_weight(i)
        ).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc2_weight(
            weights, deepstack_merger_linear_fc2_weight.data_ptr(), i
        )
        del deepstack_merger_linear_fc2_weight

        deepstack_merger_linear_fc2_bias = load_specific_tensor(
            model_path, vis_names.deepstack_merger_linear_fc2_bias(i)
        ).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc2_bias(
            weights, deepstack_merger_linear_fc2_bias.data_ptr(), i
        )
        del deepstack_merger_linear_fc2_bias

        deepstack_merger_norm_weight = load_specific_tensor(
            model_path, vis_names.deepstack_merger_norm_weight(i)
        ).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_deepstack_merger_norm_weight(
            weights, deepstack_merger_norm_weight.data_ptr(), i
        )
        del deepstack_merger_norm_weight

        deepstack_merger_norm_bias = load_specific_tensor(
            model_path, vis_names.deepstack_merger_norm_bias(i)
        ).to(meta.torch_dtype)
        weight_loader.contents.vis_loader.load_deepstack_merger_norm_bias(
            weights, deepstack_merger_norm_bias.data_ptr(), i
        )
        del deepstack_merger_norm_bias

    merger_linear_fc1_weight = load_specific_tensor(model_path, vis_names.merger_linear_fc1_weight()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_merger_linear_fc1_weight(weights, merger_linear_fc1_weight.data_ptr())
    del merger_linear_fc1_weight

    merger_linear_fc1_bias = load_specific_tensor(model_path, vis_names.merger_linear_fc1_bias()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_merger_linear_fc1_bias(weights, merger_linear_fc1_bias.data_ptr())
    del merger_linear_fc1_bias

    merger_linear_fc2_weight = load_specific_tensor(model_path, vis_names.merger_linear_fc2_weight()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_merger_linear_fc2_weight(weights, merger_linear_fc2_weight.data_ptr())
    del merger_linear_fc2_weight

    merger_linear_fc2_bias = load_specific_tensor(model_path, vis_names.merger_linear_fc2_bias()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_merger_linear_fc2_bias(weights, merger_linear_fc2_bias.data_ptr())
    del merger_linear_fc2_bias

    merger_norm_weight = load_specific_tensor(model_path, vis_names.merger_norm_weight()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_merger_norm_weight(weights, merger_norm_weight.data_ptr())
    del merger_norm_weight

    merger_norm_bias = load_specific_tensor(model_path, vis_names.merger_norm_bias()).to(meta.torch_dtype)
    weight_loader.contents.vis_loader.load_merger_norm_bias(weights, merger_norm_bias.data_ptr())
    del merger_norm_bias
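Editor's note: an illustrative toy-shape sketch (not from the PR) of the per-device QKV interleaving used for `attn_qkv_proj` above. Each device receives its own contiguous [Q-slice, K-slice, V-slice] block, so the fused weight can later be split across devices with a single offset.

```python
import torch

# Toy dimensions: 4 query heads, 2 KV heads, head_dim 8, hidden 32, 2 devices
nh, nkvh, dh, d, ndev = 4, 2, 8, 32, 2
Q = torch.arange(nh * dh * d, dtype=torch.float32).reshape(nh, dh, d)
K = torch.zeros(nkvh, dh, d)
V = torch.ones(nkvh, dh, d)

blocks = []
for idev in range(ndev):
    blocks.append(Q[idev * nh // ndev:(idev + 1) * nh // ndev])       # this device's Q heads
    blocks.append(K[idev * nkvh // ndev:(idev + 1) * nkvh // ndev])   # this device's K heads
    blocks.append(V[idev * nkvh // ndev:(idev + 1) * nkvh // ndev])   # this device's V heads
qkv = torch.cat(blocks, dim=0)             # [(nh + 2 * nkvh), dh, d] = [8, 8, 32]
rows_per_device = (nh + 2 * nkvh) // ndev  # 4 rows per device
print(qkv.shape, rows_per_device)
```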
class Qwen3vlBatchedTask:
    def __init__(
        self,
        tasks: List[InferTask],
        all_pixel_values=None,
        all_image_grid_thw=None,
        all_pixel_values_videos=None,
        all_video_grid_thw=None,
    ):
        self.tasks = tasks
        self.nreq = len(tasks)

        # Precompute fields
        token_lists = [t.tokens for t in tasks]
        self.req_lens_list = [len(toks) for toks in token_lists]
        self.req_pos_list = [t.pos for t in tasks]
        self.kv_cache_ptrs = [t.kvcache().data() for t in tasks]
        self.temperaturas_list = [t.temperature for t in tasks]
        self.topks_list = [t.topk for t in tasks]
        self.topps_list = [t.topp for t in tasks]

        # Flatten token lists
        flat_tokens = [tok for toks in token_lists for tok in toks]
        self.ntok = len(flat_tokens)

        # Convert to ctypes arrays in one pass
        self.tokens = (c_uint * self.ntok)(*flat_tokens)
        self.req_lens = (c_uint * self.nreq)(*self.req_lens_list)
        self.req_pos = (c_uint * self.nreq)(*self.req_pos_list)
        self.kv_caches = (POINTER(Qwen3vlCacheCStruct) * self.nreq)(*self.kv_cache_ptrs)
        self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list)
        self.topks = (c_uint * self.nreq)(*self.topks_list)
        self.topps = (c_float * self.nreq)(*self.topps_list)

        # Initialize visual encoder inputs
        self.pixel_values = None
        self.total_patches = 0
        self.image_grid_thw = None
        self.num_images = 0
        self.pixel_values_videos = None
        self.total_patches_videos = 0
        self.video_grid_thw = None
        self.num_videos = 0
        self.patch_features = 0

        # Prepare visual encoder inputs
        # all_pixel_values = [t.inputs['pixel_values'] for t in tasks if 'pixel_values' in t.inputs]
        # all_image_grid_thw = [t.inputs['image_grid_thw'] for t in tasks if 'image_grid_thw' in t.inputs]
        # all_pixel_values_videos = [t.inputs['pixel_values_videos'] for t in tasks if 'pixel_values_videos' in t.inputs]
        # all_video_grid_thw = [t.inputs['video_grid_thw'] for t in tasks if 'video_grid_thw' in t.inputs]
        if all_pixel_values is not None:
            print(all_pixel_values.shape)
            concat_pixel_values = (
                torch.cat(all_pixel_values, dim=0)
                if isinstance(all_pixel_values, list)
                else all_pixel_values
            )  # (total_patches, features)
            self.total_patches = concat_pixel_values.shape[0]
            self.patch_features = concat_pixel_values.shape[1]
            self.flat_pixels = concat_pixel_values.flatten().to(torch.bfloat16).contiguous()
            self.pixel_values = self.flat_pixels.data_ptr()

        if all_image_grid_thw is not None:
            concat_grid_thw = (
                torch.cat(all_image_grid_thw, dim=0)
                if isinstance(all_image_grid_thw, list)
                else all_image_grid_thw
            )  # (total_images, 3)
            self.num_images = concat_grid_thw.shape[0]
            self.flat_grid = concat_grid_thw.flatten().to(torch.int32).contiguous().tolist()
            self.image_grid_thw = (c_uint * len(self.flat_grid))(*self.flat_grid)

        if all_pixel_values_videos is not None:
            concat_pixel_values_videos = torch.cat(all_pixel_values_videos, dim=0)
            # (total_patches_videos, features)
            self.total_patches_videos = concat_pixel_values_videos.shape[0]
            self.patch_features_videos = concat_pixel_values_videos.shape[1]
            print(self.patch_features_videos, flush=True)
            self.flat_pixels_videos = (
                concat_pixel_values_videos.flatten().to(torch.bfloat16).contiguous()
            )
            self.pixel_values_videos = self.flat_pixels_videos.data_ptr()

        if all_video_grid_thw is not None:
            concat_grid_thw_videos = torch.cat(all_video_grid_thw, dim=0)
            # (total_videos, 3)
            self.num_videos = concat_grid_thw_videos.shape[0]
            flat_grid_videos = concat_grid_thw_videos.flatten().to(torch.int32).contiguous()
            self.video_grid_thw = (c_uint * len(flat_grid_videos))(*flat_grid_videos.tolist())

    def input_args(self):
        return (
            self.tokens,
            self.ntok,
            self.pixel_values,
            self.total_patches,
            self.image_grid_thw,
            self.num_images,
            self.pixel_values_videos,
            self.total_patches_videos,
            self.video_grid_thw,
            self.num_videos,
            self.patch_features,
            self.req_lens,
            self.nreq,
            self.req_pos,
            self.kv_caches,
            self.temperaturas,
            self.topks,
            self.topps,
        )
# Still need to handle the visual encoder cache and the image / video inputs
class Qwen3vlForCauslLM:
    def __init__(self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None):
        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
            config = json.load(f)
        self.config = config
        eos_token_id = self.config["text_config"]["eos_token_id"]
        self.eos_token_id = [eos_token_id] if type(eos_token_id) == int else eos_token_id
        print(model_dir_path)
        if "qwen3_vl" == config["model_type"]:
            self.meta = Qwen3vlMeta(config, max_tokens=max_tokens)
            self.processor = transformers.AutoProcessor.from_pretrained(model_dir_path)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
        else:
            raise ValueError("Unsupported model architecture")

        print(f"Creating model on {ndev} devices...")
        load_start_time = time.time()
        dev_ids = (c_int * ndev)(*[i for i in range(ndev)])
        self.model_instance = Qwen3vlModel()
        weights = self.model_instance.create_weights(
            byref(self.meta), device, ndev, dev_ids, c_bool(True)
        )
        print("Loading weights...")
        # Load weights from host
        load_Qwen3vl_weights(self.meta, weights, model_dir_path, ndev)
        # Create model instance
        self.model_ptr = self.model_instance.create_model(byref(self.meta), weights)
        load_end_time = time.time()
        print(f"Time used: {load_end_time - load_start_time:.3f}s")

    def max_context_len(self):
        return self.meta.text_meta.max_tokens

    def create_kv_cache(self):
        return self.model_instance.create_cache(self.model_ptr)

    def drop_kv_cache(self, kv_cache):
        self.model_instance.drop_cache(self.model_ptr, kv_cache)

    def batch_infer_one_round(
        self,
        tasks: List[InferTask],
        all_pixel_values=None,
        all_image_grid_thw=None,
        all_pixel_values_videos=None,
        all_video_grid_thw=None,
    ):
        output = (c_uint * len(tasks))()
        batch_inputs = Qwen3vlBatchedTask(
            tasks,
            all_pixel_values,
            all_image_grid_thw,
            all_pixel_values_videos,
            all_video_grid_thw,
        )
        self.model_instance.infer_batch(
            self.model_ptr,
            *(batch_inputs.input_args()),
            output,
        )
        return list(output)

    def generate(self, input_content, max_steps=0, topp_=1.0, topk_=1, temperature_=1.0):
        inputs = self.processor.apply_chat_template(
            conversation=[{"role": "user", "content": input_content}],
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        tokens = inputs["input_ids"][0].tolist()
        pixel_values = inputs["pixel_values"] if "pixel_values" in inputs else None
        image_grid_thw = inputs["image_grid_thw"] if "image_grid_thw" in inputs else None
        pixel_values_videos = (
            inputs["pixel_values_videos"] if "pixel_values_videos" in inputs else None
        )
        video_grid_thw = inputs["video_grid_thw"] if "video_grid_thw" in inputs else None
        infer_task = InferTask(
            0,
            tokens,
            self.max_context_len(),
            temperature_,
            topk_,
            topp_,
            self.eos_token_id,
        )
        infer_task.bind_kvcache(KVCache(self))

        print(input_content)
        steps = 0
        total_time = 0
        output_content = ""
        # print(inputs['input_ids'][0].tolist(), flush=True)
        for step_i in range(max_steps if max_steps > 0 else self.max_context_len()):
            start_time = time.time()
            output_tokens = self.batch_infer_one_round(
                [infer_task],
                pixel_values,
                image_grid_thw,
                pixel_values_videos,
                video_grid_thw,
            )
            # print(output_tokens)
            end_time = time.time()
            steps += 1
            output_str = self.tokenizer.decode(output_tokens[0])
            output_content += output_str
            print(output_str, end="", flush=True)
            pixel_values = None
            image_grid_thw = None
            pixel_values_videos = None
            video_grid_thw = None
            if output_tokens[0] in self.eos_token_id:
                break
            infer_task.next(output_tokens[0])
            if step_i > 0:
                total_time += end_time - start_time

        print("\n")
        avg_time = total_time * 1000 / steps if steps > 0 else -1
        # print(output_content, flush=True)
        print(f"Time per step: {avg_time:.3f}ms")

        infer_task._kv_cache.drop(self)
        return output_content, avg_time

    def destroy_model_instance(self):
        self.model_instance.destroy_model(self.model_ptr)
        print("Model destroyed")
def test():
    if len(sys.argv) < 3:
        print(
            "Usage: python qwen3vl.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] <path/to/model_dir> [n_device]"
        )
        sys.exit(1)
    model_path = sys.argv[2]
    device_type = DeviceType.DEVICE_TYPE_CPU
    if sys.argv[1] == "--cpu":
        device_type = DeviceType.DEVICE_TYPE_CPU
    elif sys.argv[1] == "--nvidia":
        device_type = DeviceType.DEVICE_TYPE_NVIDIA
    elif sys.argv[1] == "--cambricon":
        device_type = DeviceType.DEVICE_TYPE_CAMBRICON
    elif sys.argv[1] == "--ascend":
        device_type = DeviceType.DEVICE_TYPE_ASCEND
    elif sys.argv[1] == "--metax":
        device_type = DeviceType.DEVICE_TYPE_METAX
    elif sys.argv[1] == "--moore":
        device_type = DeviceType.DEVICE_TYPE_MOORE
    elif sys.argv[1] == "--iluvatar":
        device_type = DeviceType.DEVICE_TYPE_ILUVATAR
    else:
        print(
            "Usage: python qwen3vl.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] <path/to/model_dir> [n_device]"
        )
        sys.exit(1)

    ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    img_url = None
    if len(sys.argv) > 4:
        img_url = sys.argv[4]
    model = Qwen3vlForCauslLM(model_path, device_type, ndev, max_tokens=1024)
    input_content = (
        [
            {"type": "text", "text": "Describe this image."},
            {"type": "image", "url": img_url},
        ]
        if img_url is not None
        else [{"type": "text", "text": "What is the highest mountain in Shandong?"}]
    )
    model.generate(input_content)
    model.destroy_model_instance()


if __name__ == "__main__":
    test()
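Editor's note: for reference, the script above is launched directly from the command line; a typical single-GPU invocation (with hypothetical paths) would be `python scripts/qwen3vl.py --nvidia /path/to/Qwen3-VL 1 https://example.com/demo.jpg`, where the trailing image URL (argv[4]) is optional and the prompt falls back to text-only when it is omitted.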
src/allocator.hpp

...
@@ -16,7 +16,7 @@ public:
 class MemoryPool : public AllocatorBase {
 public:
-    static constexpr size_t DEFAULT_ALIGNMENT = 256;
+    static constexpr size_t DEFAULT_ALIGNMENT = 512;
     explicit MemoryPool(size_t initialSize = 0, size_t alignment = DEFAULT_ALIGNMENT);
     ~MemoryPool();
...
src/cache_manager/opcache_manager.hpp

...
@@ -153,6 +153,8 @@ public:
class CacheManager {
public:
    DECLARE_OP_CACHE(Add)
    DECLARE_OP_CACHE(Conv)
    DECLARE_OP_CACHE(Mul)
    DECLARE_OP_CACHE(RMSNorm)
    DECLARE_OP_CACHE(Gemm)
    DECLARE_OP_CACHE(RoPE)
...
@@ -160,11 +162,14 @@ public:
    DECLARE_OP_CACHE(CausalSoftmax)
    DECLARE_OP_CACHE(Topkrouter)
    DECLARE_OP_CACHE(SwiGLU)
    DECLARE_OP_CACHE(Silu)
    DECLARE_OP_CACHE(RandomSample)
    DECLARE_OP_CACHE(DequantizeAWQ)

    CacheManager(size_t capacity = 100)
        : Add_cache(capacity, DESTROY_FUNC(Add)),
          Conv_cache(capacity, DESTROY_FUNC(Conv)),
          Mul_cache(capacity, DESTROY_FUNC(Mul)),
          RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)),
          Gemm_cache(capacity, DESTROY_FUNC(Gemm)),
          RoPE_cache(capacity, DESTROY_FUNC(RoPE)),
...
@@ -172,6 +177,7 @@ public:
          CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)),
          Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)),
          SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)),
          Silu_cache(capacity, DESTROY_FUNC(Silu)),
          RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)),
          DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {}
...
src/models/inference_context.cpp

...
@@ -33,6 +33,61 @@ void InferenceContext::add(std::shared_ptr<Tensor> c,
                       c->data(), a->data(), b->data(), stream));
}

void InferenceContext::conv(std::shared_ptr<Tensor> y,
                            std::shared_ptr<Tensor> x,
                            std::shared_ptr<Tensor> w,
                            std::shared_ptr<Tensor> bias,
                            void *pads, void *strides, void *dilations, size_t n) {
    size_t key = CacheManager::createDescriptorKey(y, x, w, bias);
    // Combine additional parameters into the key for unique identification
    hash_combine(key, std::hash<void *>()(pads));
    hash_combine(key, std::hash<void *>()(strides));
    hash_combine(key, std::hash<void *>()(dilations));
    hash_combine(key, std::hash<size_t>()(n));

    infiniopConvDescriptor_t desc;
    if (!cache_manager->getConvDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateConvDescriptor(
            op_handle, &desc, y->desc(), x->desc(), w->desc(),
            bias ? bias->desc() : nullptr,
            pads, strides, dilations, n));
        cache_manager->putConvDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetConvWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopConv(desc, workspace, workspace_size,
                            y->data(), x->data(), w->data(),
                            bias ? bias->data() : nullptr, stream));
}

void InferenceContext::mul(std::shared_ptr<Tensor> c,
                           std::shared_ptr<Tensor> a,
                           std::shared_ptr<Tensor> b) {
    size_t key = CacheManager::createDescriptorKey(c, a, b);

    infiniopMulDescriptor_t desc;
    if (!cache_manager->getMulDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateMulDescriptor(
            op_handle, &desc, c->desc(), a->desc(), b->desc()));
        cache_manager->putMulDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetMulWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopMul(desc, workspace, workspace_size,
                           c->data(), a->data(), b->data(), stream));
}

void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
                               std::shared_ptr<Tensor> x,
                               std::shared_ptr<Tensor> w,
...
@@ -189,6 +244,26 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out,
                          out->data(), up->data(), gate->data(), stream));
}

void InferenceContext::silu(std::shared_ptr<Tensor> out,
                            std::shared_ptr<Tensor> input) {
    size_t key = CacheManager::createDescriptorKey(out, input);

    infiniopSiluDescriptor_t desc;
    if (!cache_manager->getSiluDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateSiluDescriptor(op_handle, &desc, out->desc(), input->desc()));
        cache_manager->putSiluDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetSiluWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopSilu(desc, workspace, workspace_size,
                            out->data(), input->data(), stream));
}

void InferenceContext::randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob,
                                    float random_val, float top_p, uint32_t top_k, float temperature) {
...
src/models/inference_context.hpp

...
@@ -19,6 +19,14 @@ struct InferenceContext {
    void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b);
    void conv(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x,
              std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> bias,
              void *pads, void *strides, void *dilations, size_t n);
    void mul(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b);
    void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> w,
...
@@ -48,6 +56,8 @@ struct InferenceContext {
    void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> gate);
    void silu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> input);
    void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob,
                      float random_val, float top_p, uint32_t top_k, float temperature);
...
@@ -81,6 +91,15 @@ inline void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::share
    getInferenceContext().add(c, a, b);
}

inline void conv(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> w,
                 std::shared_ptr<Tensor> bias, void *pads, void *strides, void *dilations, size_t n) {
    getInferenceContext().conv(y, x, w, bias, pads, strides, dilations, n);
}

inline void mul(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b) {
    getInferenceContext().mul(c, a, b);
}

inline void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> w,
                    float epsilon) {
    getInferenceContext().rmsnorm(y, x, w, epsilon);
...
@@ -131,6 +150,10 @@ inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
    getInferenceContext().swiglu(out, up, gate);
}

inline void silu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> input) {
    getInferenceContext().silu(out, input);
}

inline void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob, float random_val,
                         float top_p, uint32_t top_k, float temperature) {
    getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature);
...
src/models/qwen3vl/qwen3vl.cpp
0 → 100644

#include "qwen3vl_impl.hpp"
#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "../inference_context.hpp"
#include "infinicore_infer.h"
#include <random>
#include <thread>
#include <vector>

void createDeviceResource(Qwen3vlDeviceResource *rsrc, const Qwen3vlMeta *meta,
                          std::shared_ptr<Qwen3vlDeviceWeights> weights,
                          infiniDevice_t device, int idev, int ndev, int dev_id,
                          infinicclComm_t comm) {
    RUN_INFINI(infinirtSetDevice(device, dev_id));
    RUN_INFINI(infinirtStreamSynchronize(weights->load_stream));
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle);
    infinirtStream_t stream;
    infinirtStreamCreate(&stream);

    auto memory_pool = std::make_shared<MemoryPool>();

    *rsrc = Qwen3vlDeviceResource{
        device,
        dev_id,
        handle,
        weights,
        stream,
        comm,
        memory_pool,
    };
    RUN_INFINI(infinirtDeviceSynchronize());
}

void releaseDeviceResource(Qwen3vlDeviceResource &res) {
    infinirtDeviceSynchronize();
    res.weights.reset();
    infiniopDestroyHandle(res.handle);
    res.handle = nullptr;
    infinirtStreamDestroy(res.stream);
    res.stream = nullptr;
    infinicclCommDestroy(res.comm);
    res.comm = nullptr;
}

inline std::shared_ptr<Tensor> get_custom_SinTable(const Qwen3vlMeta &meta,
                                                   std::vector<std::vector<uint32_t>> &pos_ids,
                                                   uint32_t dim, size_t theta) {
    // pos_ids shape: [seq, dim/2], pos ids acting on each dim
    auto unit = dsize(meta.dtype);
    auto half_dim = dim / 2;
    size_t len = pos_ids.size();
    void *table = std::malloc(len * half_dim * unit);

    for (size_t i = 0; i < len; i++) {
        for (size_t j = 0; j < half_dim; j++) {
            float _sin = std::sin(static_cast<float>(pos_ids[i][j])
                                  / std::pow(theta, static_cast<float>(j) / half_dim));
            if (meta.dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table)[i * half_dim + j] = f32_to_f16(_sin);
            } else if (meta.dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table)[i * half_dim + j] = f32_to_bf16(_sin);
            } else if (meta.dtype == INFINI_DTYPE_F32) {
                ((float *)table)[i * half_dim + j] = _sin;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({len, half_dim});
    auto tensor = Tensor::weight(table, meta.dtype, shape);
    std::free(table);
    return tensor;
}

inline std::shared_ptr<Tensor> get_custom_CosTable(const Qwen3vlMeta &meta,
                                                   std::vector<std::vector<uint32_t>> &pos_ids,
                                                   uint32_t dim, size_t theta) {
    // pos_ids shape: [seq, dim/2], pos ids acting on each dim
    auto unit = dsize(meta.dtype);
    auto half_dim = dim / 2;
    size_t len = pos_ids.size();
    void *table = std::malloc(len * half_dim * unit);

    for (size_t i = 0; i < len; i++) {
        for (size_t j = 0; j < half_dim; j++) {
            float _cos = std::cos(static_cast<float>(pos_ids[i][j])
                                  / std::pow(theta, static_cast<float>(j) / half_dim));
            if (meta.dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table)[i * half_dim + j] = f32_to_f16(_cos);
            } else if (meta.dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table)[i * half_dim + j] = f32_to_bf16(_cos);
            } else if (meta.dtype == INFINI_DTYPE_F32) {
                ((float *)table)[i * half_dim + j] = _cos;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({len, half_dim});
    auto tensor = Tensor::weight(table, meta.dtype, shape);
    std::free(table);
    return tensor;
}
inline std::shared_ptr<Tensor> fast_pos_embed_interpolate(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t *grid_thw, uint32_t num_batch, uint32_t total_patches) {
    auto dtype = meta.dtype;
    auto num_position_embeddings = meta.vis_meta.num_position_embeddings;
    auto hidden_size = meta.vis_meta.hidden_size;
    auto merge_size = meta.vis_meta.spatial_merge_size;
    auto num_grid_per_side = static_cast<uint32_t>(sqrt(num_position_embeddings));
    uint32_t total_pixels_offset = 0;
    std::shared_ptr<Tensor> patch_pos_embeds = Tensor::buffer(dtype, {total_patches, hidden_size}, rsrc.memory_pool);
    auto pos_embed_weight = rsrc.weights->w_vis->pos_embed_weight;
    std::vector<std::shared_ptr<Tensor>> pos_embeds(4);
    for (uint32_t i = 0; i < num_batch; ++i) {
        uint32_t t = grid_thw[i * 3];
        uint32_t h = grid_thw[i * 3 + 1];
        uint32_t w = grid_thw[i * 3 + 2];
        auto weight_array = std::vector<uint16_t>(h * w * hidden_size);
        auto weight_tensor = Tensor::buffer(dtype, {h * w, hidden_size}, rsrc.memory_pool);
        // Compute interpolation indices and weights
        std::vector<std::vector<uint32_t>> indices(4);
        std::vector<std::vector<float>> weights(4);
        auto linspace = [](float start, float end, uint32_t num_points) -> std::vector<float> {
            std::vector<float> res(num_points);
            for (uint32_t i = 0; i < num_points; ++i) {
                res[i] = start + (end - start) * i / (num_points - 1);
            }
            return res;
        };
        auto h_idxs = linspace(0, num_grid_per_side - 1, h);
        auto w_idxs = linspace(0, num_grid_per_side - 1, w);
        for (uint32_t ih = 0; ih < h; ++ih) {
            for (uint32_t iw = 0; iw < w; ++iw) {
                float h_idx_f = h_idxs[ih], w_idx_f = w_idxs[iw];
                uint32_t h_idx_floor = static_cast<uint32_t>(floor(h_idx_f)),
                         w_idx_floor = static_cast<uint32_t>(floor(w_idx_f));
                uint32_t h_idx_ceil = std::min(static_cast<uint32_t>(ceil(h_idx_f)), num_grid_per_side - 1),
                         w_idx_ceil = std::min(static_cast<uint32_t>(ceil(w_idx_f)), num_grid_per_side - 1);
                float dh = h_idx_f - h_idx_floor, dw = w_idx_f - w_idx_floor;
                indices[0].push_back((h_idx_floor * num_grid_per_side) + w_idx_floor);
                indices[1].push_back((h_idx_floor * num_grid_per_side) + w_idx_ceil);
                indices[2].push_back((h_idx_ceil * num_grid_per_side) + w_idx_floor);
                indices[3].push_back((h_idx_ceil * num_grid_per_side) + w_idx_ceil);
                weights[0].push_back((1 - dh) * (1 - dw));
                weights[1].push_back((1 - dh) * dw);
                weights[2].push_back(dh * (1 - dw));
                weights[3].push_back(dh * dw);
            }
        }
        // Look up the table and compute the weighted sum
        for (int j = 0; j < 4; ++j) {
            pos_embeds[j] = Tensor::buffer(dtype, {h * w, hidden_size}, rsrc.memory_pool);
            // Gather the position embeddings by index, then multiply by the interpolation weights
            for (size_t i = 0; i < h * w; i++) {
                rearrange(pos_embeds[j]->slice(0, i, 1), pos_embed_weight->slice(0, indices[j][i], 1));
            }
            for (size_t i = 0; i < h * w; i++) {
                uint16_t w_value = f32_to_bf16(weights[j][i]);
                for (size_t k = 0; k < hidden_size; k++) {
                    weight_array[i * hidden_size + k] = w_value;
                }
            }
            RUN_INFINI(infinirtMemcpyAsync(weight_tensor->data(), weight_array.data(),
                                           sizeof(uint16_t) * h * w * hidden_size,
                                           INFINIRT_MEMCPY_H2D, rsrc.stream));
            mul(pos_embeds[j], pos_embeds[j], weight_tensor);
        }
        // Merge the four corner contributions
        auto patch_pos_embed = pos_embeds[0]; // [h*w, hidden_size]
        for (int j = 1; j < 4; ++j) {
            add(patch_pos_embed, patch_pos_embed, pos_embeds[j]);
        }
        // For video input with T > 1 frames, repeat patch_pos_embed T times
        if (t > 1) {
            auto temp_patch_pos_embed = Tensor::buffer(dtype, {t, h * w, hidden_size}, rsrc.memory_pool);
            for (size_t i = 0; i < t; i++) {
                rearrange(temp_patch_pos_embed->slice(0, i, 1), patch_pos_embed);
            }
            patch_pos_embed = temp_patch_pos_embed;
        }
        printf("merge patch pos embed\n");
        fflush(stdout);
        patch_pos_embed = patch_pos_embed->view({t, h / merge_size, merge_size, w / merge_size, merge_size, hidden_size})
                              ->permute({0, 1, 3, 2, 4, 5})
                              ->view({t * h * w, hidden_size}); // the second view may fail if memory is no longer contiguous
        rearrange(patch_pos_embeds->slice(0, total_pixels_offset, t * h * w), patch_pos_embed);
        total_pixels_offset += t * h * w;
    }
    return patch_pos_embeds;
}
inline auto rot_pos_embed(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t *grid_thw, uint32_t num_batch, uint32_t total_patches) {
    auto dtype = meta.dtype;
    auto hidden_size = meta.vis_meta.hidden_size;
    auto num_heads = meta.vis_meta.num_heads;
    auto head_dim = hidden_size / num_heads;
    auto merge_size = meta.vis_meta.spatial_merge_size;
    std::vector<std::vector<uint32_t>> pos_ids_table_y(total_patches, std::vector<uint32_t>(head_dim / 4));
    std::vector<std::vector<uint32_t>> pos_ids_table_x(total_patches, std::vector<uint32_t>(head_dim / 4));
    for (uint32_t b = 0; b < num_batch; ++b) {
        uint32_t offset = b * 3;
        uint32_t num_frames = grid_thw[offset + 0];
        uint32_t height = grid_thw[offset + 1];
        uint32_t width = grid_thw[offset + 2];
        uint32_t merged_h = height / merge_size;
        uint32_t merged_w = width / merge_size;
        // Walk over every merged block and every position inside each block
        size_t patch_offset = 0;
        for (uint32_t bh = 0; bh < merged_h; ++bh) {
            for (uint32_t bw = 0; bw < merged_w; ++bw) {
                for (uint32_t ih = 0; ih < merge_size; ++ih) {
                    for (uint32_t iw = 0; iw < merge_size; ++iw) {
                        uint32_t row = bh * merge_size + ih;
                        uint32_t col = bw * merge_size + iw;
                        // For multi-frame input, repeat num_frames times
                        for (uint32_t f = 0; f < num_frames; ++f) {
                            size_t dim_offset = 0;
                            for (; dim_offset < head_dim / 4; dim_offset++) {
                                pos_ids_table_y[patch_offset][dim_offset] = row;
                                pos_ids_table_x[patch_offset][dim_offset] = col;
                            }
                            patch_offset++;
                        }
                    }
                }
            }
        }
    }
    auto sin = Tensor::buffer(dtype, {total_patches, head_dim / 2}, rsrc.memory_pool);
    auto sin_y = get_custom_SinTable(meta, pos_ids_table_y, head_dim / 2, 10000);
    rearrange(sin->slice(1, 0, head_dim / 4), sin_y);
    auto sin_x = get_custom_SinTable(meta, pos_ids_table_x, head_dim / 2, 10000);
    rearrange(sin->slice(1, head_dim / 4, head_dim / 4), sin_x);
    auto cos = Tensor::buffer(dtype, {total_patches, head_dim / 2}, rsrc.memory_pool);
    auto cos_y = get_custom_CosTable(meta, pos_ids_table_y, head_dim / 2, 10000);
    rearrange(cos->slice(1, 0, head_dim / 4), cos_y);
    auto cos_x = get_custom_CosTable(meta, pos_ids_table_x, head_dim / 2, 10000);
    rearrange(cos->slice(1, head_dim / 4, head_dim / 4), cos_x);
    return std::pair{sin, cos};
}
void inferDeviceBatchVision(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t idev, uint32_t ndev, InferRequest &req) {
    void *pixel_values = req.pixel_values;
    uint32_t total_patches = req.total_patches;
    uint32_t *image_grid_thw = req.image_grid_thw;
    uint32_t num_images = req.num_images;
    void *pixel_values_videos = req.pixel_values_videos;
    uint32_t total_patches_videos = req.total_patches_videos;
    // uint32_t *video_grid_thw = req.video_grid_thw;
    // uint32_t num_videos = req.num_videos;
    // uint32_t patch_features = req.patch_features;
    auto dtype = meta.dtype;
    auto d = meta.vis_meta.hidden_size;
    auto channels = meta.vis_meta.in_channels;
    auto patch_size = meta.vis_meta.patch_size;
    auto temporal_patch_size = meta.vis_meta.temporal_patch_size;
    // auto stream = rsrc.stream;
    auto weights = rsrc.weights;
    auto image_tensor = Tensor::weight(pixel_values, dtype, {total_patches, channels * temporal_patch_size * patch_size * patch_size});
    auto video_tensor = Tensor::weight(pixel_values_videos, dtype, {total_patches_videos, channels * temporal_patch_size * patch_size * patch_size});
    auto hidden_states = Tensor::buffer(dtype, {total_patches, d, 1, 1, 1}, rsrc.memory_pool);
    std::vector<size_t> pads = {0, 0, 0};
    std::vector<ptrdiff_t> strides = {static_cast<long>(temporal_patch_size), static_cast<long>(patch_size), static_cast<long>(patch_size)};
    std::vector<size_t> dilations = {1, 1, 1};
    conv(hidden_states, image_tensor, rsrc.weights->w_vis->patch_embed_weight, rsrc.weights->w_vis->patch_embed_bias,
         pads.data(), strides.data(), dilations.data(), 3);
    hidden_states = hidden_states->view({total_patches, d});
    auto pos_embeds = fast_pos_embed_interpolate(meta, rsrc, image_grid_thw, num_images, total_patches);
    add(hidden_states, hidden_states, pos_embeds);
    auto [sin, cos] = rot_pos_embed(meta, rsrc, image_grid_thw, num_images, total_patches);
}
void inferDeviceBatchText(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t idev, uint32_t ndev, InferRequest &req) {
    const uint32_t *tokens = req.tokens;
    uint32_t ntok = req.ntok;
    const uint32_t *req_lens = req.req_lens;
    uint32_t nreq = req.nreq;
    const uint32_t *req_pos = req.req_pos;
    struct Qwen3vlCache **caches = req.kv_caches;
    const float *temperature = req.temperature;
    const uint32_t *topk = req.topk;
    const float *topp = req.topp;
    uint32_t *output = req.output;
    void *last_logits = req.logits;
    assert(meta.text_meta.num_attention_heads % ndev == 0);
    assert(meta.text_meta.num_key_value_heads % ndev == 0);
    auto dtype = meta.dtype;
    auto nlayer = meta.text_meta.num_hidden_layers;
    size_t nh = meta.text_meta.num_attention_heads / size_t(ndev);
    size_t nkvh = meta.text_meta.num_key_value_heads / size_t(ndev);
    auto ngroup = nh / nkvh;
    auto dh = meta.text_meta.head_dim;
    auto d = meta.text_meta.hidden_size;
    auto di = meta.text_meta.intermediate_size / size_t(ndev);
    auto dvoc = meta.text_meta.vocab_size;
    float epsilon = meta.text_meta.rms_norm_eps;
    auto stream = rsrc.stream;
    auto weights = rsrc.weights;
    // Allocate buffers
    auto logits_in = Tensor::buffer(dtype, {ntok, d}, rsrc.memory_pool);
    auto logits_out = Tensor::buffer(dtype, {ntok, d}, rsrc.memory_pool);
    // Current tokens of all requests
    auto qkv_buf = Tensor::buffer(dtype, {ntok, (nh + nkvh * 2) * dh}, rsrc.memory_pool);
    auto o_buf = Tensor::buffer(dtype, {ntok, nh * dh}, rsrc.memory_pool);
    auto gate_up_buf = Tensor::buffer(dtype, {ntok, 2 * di}, rsrc.memory_pool);
    auto prob_buf = Tensor::buffer(dtype, {nreq, dvoc}, rsrc.memory_pool);
    auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool);
    auto result_cpu = std::vector<int64_t>(nreq);
    auto qkv_rope = qkv_buf->view({ntok, nh + nkvh * 2, dh});
    auto q_buf = qkv_rope->slice(1, 0, nh);
    auto k_buf = qkv_rope->slice(1, nh, nkvh);
    // Prepare inputs
    auto batch_pos_ids = std::vector<uint32_t>(ntok);
    size_t req_start = 0;
    for (uint32_t req = 0; req < nreq; req++) {
        for (uint32_t i = 0; i < req_lens[req]; i++) {
            // req_len is the length of the current query; req_pos is the history length
            batch_pos_ids[req_start + i] = req_pos[req] + i; // batch_pos_ids holds the flattened position of every token
        }
        req_start += req_lens[req];
    }
    std::shared_ptr<Tensor> pos_ids_buf;
    if (rsrc.device == INFINI_DEVICE_CPU) {
        pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
    } else {
        pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool);
        RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok,
                                       INFINIRT_MEMCPY_H2D, stream));
    }
    // convert tokens to embeddings
    for (uint32_t i = 0; i < ntok; i++) {
        RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
                                       weights->w_lang->in_embd->data(tokens[i] * d),
                                       dsize(dtype) * d, INFINIRT_MEMCPY_D2D, stream));
    }
    // attention inner
    size_t max_qk_size = 0;
    size_t max_seq_len = 0;
    for (uint32_t req = 0; req < nreq; req++) {
        auto past_len = req_pos[req];
        auto seq_len = req_lens[req];
        auto total_len = past_len + seq_len;
        max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len));
        max_seq_len = std::max(max_seq_len, size_t(seq_len));
    }
    auto qk_buf = Tensor::buffer(dtype, {nh * max_qk_size}, rsrc.memory_pool);
    auto rearrange_q_buf = Tensor::buffer(dtype, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
    auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh});
    auto attn_val_buf = Tensor::buffer(dtype, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
    auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh});
    auto gate_buf = gate_up_buf->slice(1, 0, di);
    auto up_buf = gate_up_buf->slice(1, di, di);
    // Compute
    for (uint32_t i = 0; i < nlayer; i++) {
        // attn norm
        rmsnorm(logits_out, logits_in, weights->w_lang->layers[i].attn_norm, epsilon);
        // qkv_proj
        linear(qkv_buf, logits_out, weights->w_lang->layers[i].attn_qkv_proj, 1.0, 0.0, nullptr, nullptr);
        // qk_norm
        rmsnorm(q_buf, q_buf, weights->w_lang->layers[i].attn_q_norm, epsilon);
        rmsnorm(k_buf, k_buf, weights->w_lang->layers[i].attn_k_norm, epsilon);
        // rope
        rope_v2(q_buf, q_buf, pos_ids_buf, weights->sin_table, weights->cos_table);
        rope_v2(k_buf, k_buf, pos_ids_buf, weights->sin_table, weights->cos_table);
        // Process each request one by one
        size_t token_offset = 0;
        for (uint32_t req = 0; req < nreq; req++) {
            auto past_len = req_pos[req];
            auto seq_len = req_lens[req];
            auto total_len = past_len + seq_len;
            auto o = o_buf->slice(0, token_offset, seq_len)->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3});                     // [nkvh, ngroup, seq_len, dh]
            auto q = qkv_rope->slice({{0, token_offset, seq_len}, {1, 0, nh}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); // [nkvh, ngroup, seq_len, dh]
            auto k = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh, nkvh}});        // [ntok, nkvh, dh]
            auto v = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}}); // [ntok, nkvh, dh]
            // concat to cache
            rearrange(caches[req]->k_rot[idev][i]->slice(0, past_len, seq_len), k);
            rearrange(caches[req]->v[idev][i]->slice(0, past_len, seq_len), v);
            // fill full_k full_v
            auto full_k_buff = caches[req]->k_rot[idev][i]->slice(0, 0, total_len)->permute({1, 2, 0}); // [nkvh, dh, total_len]
            auto full_v_buff = caches[req]->v[idev][i]->slice(0, 0, total_len)->permute({1, 0, 2});     // [nkvh, total_len, dh]
            // self-attn
            rearrange(q_rearrange->slice(2, 0, seq_len), q);
            auto attn_score_req = qk_buf->slice(0, 0, nh * seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len});
            // [nkvh, ngroup * seq_len, dh] @ [nkvh, dh, total_len] = [nkvh, ngroup * seq_len, total_len]
            linear(attn_score_req, rearrange_q_buf->slice(1, 0, ngroup * seq_len), full_k_buff, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr);
            // softmax
            auto qk_softmax = attn_score_req->view({nh, seq_len, total_len});
            causalSoftmax(qk_softmax, qk_softmax);
            // [nkvh, ngroup * seq_len, total_len] @ [nkvh, total_len, dh] = [nkvh, ngroup * seq_len, dh]
            linear(attn_val_buf->slice(1, 0, ngroup * seq_len), attn_score_req, full_v_buff, 1.0, 0.0, nullptr, nullptr);
            // printf("rearrange o; layer[%d]\n", i);
            rearrange(o, attn_val_gemm->slice(2, 0, seq_len));
            token_offset += seq_len;
        }
        linear(logits_in, o_buf, weights->w_lang->layers[i].attn_o_proj, 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
        // All_reduce if distributed
        if (rsrc.comm != nullptr) {
            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d, dtype,
                                          INFINICCL_SUM, rsrc.comm, stream));
            RUN_INFINI(infinirtStreamSynchronize(stream));
        }
        // mlp norm
        rmsnorm(logits_out, logits_in, weights->w_lang->layers[i].mlp_norm, epsilon);
        // mlp gate_up
        linear(gate_up_buf, logits_out, weights->w_lang->layers[i].mlp_gate_up, 1.0, 0.0, nullptr, nullptr);
        // silu
        silu(gate_buf, gate_buf);
        mul(gate_buf, gate_buf, up_buf);
        // mlp down
        linear(logits_in, gate_buf, weights->w_lang->layers[i].mlp_down, 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
        // All_reduce if distributed
        if (rsrc.comm != nullptr) {
            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d, dtype,
                                          INFINICCL_SUM, rsrc.comm, stream));
            RUN_INFINI(infinirtStreamSynchronize(stream));
        }
    }
    // sample and output
    if (idev == 0) {
        if (last_logits != nullptr) {
            rmsnorm(logits_out, logits_in, weights->w_lang->out_norm, epsilon);
            auto last_logits_buf = Tensor::buffer(dtype, {ntok, dvoc}, rsrc.memory_pool);
            linear(last_logits_buf, logits_out, weights->w_lang->out_embd, 1.0, 0.0, nullptr, nullptr);
            RUN_INFINI(infinirtStreamSynchronize(stream));
            RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dtype) * ntok * dvoc, INFINIRT_MEMCPY_D2H));
        }
        if (output != nullptr) {
            size_t token_offset = 0;
            for (uint32_t req = 0; req < nreq; req++) {
                auto seq_len = req_lens[req];
                token_offset += seq_len;
                rmsnorm(logits_out->slice(0, req, 1), logits_in->slice(0, token_offset - 1, 1), weights->w_lang->out_norm, epsilon);
            }
            linear(prob_buf, logits_out->slice(0, 0, nreq), weights->w_lang->out_embd, 1.0, 0.0, nullptr, nullptr);
            std::random_device _rd;
            std::mt19937 gen(_rd());
            token_offset = 0;
            for (uint32_t req = 0; req < nreq; req++) {
                auto seq_len = req_lens[req];
                float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
                randomSample(result_buf->slice(0, req, 1)->view_as({}, {}),
                             prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}),
                             random_val, topp[req], topk[req], temperature[req]);
                token_offset += seq_len;
            }
            RUN_INFINI(infinirtStreamSynchronize(stream));
            RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H));
            for (uint32_t req = 0; req < nreq; req++) {
                output[req] = uint32_t(result_cpu[req]);
            }
        }
    }
}
void inferDeviceBatch(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t idev, uint32_t ndev, InferState &state, InferRequest &req) {
    // infer vision + sync
    if (req.num_images > 0 || req.num_videos > 0) {
        inferDeviceBatchVision(meta, rsrc, idev, ndev, req);
        std::unique_lock<std::mutex> lock(state.mtx_sync);
        state.sync_cnt--;
        if (state.sync_cnt == 0) {
            state.cv_sync.notify_all();
        } else {
            state.cv_sync.wait(lock, [&] { return state.sync_cnt == 0; });
        }
    }
    // infer text
    inferDeviceBatchText(meta, rsrc, idev, ndev, req);
}
__INFINI_C void inferBatchQwen3vl(struct Qwen3vlModel *model,
                                  const uint32_t *tokens, uint32_t ntok,
                                  void *pixel_values, uint32_t total_patches, uint32_t *image_grid_thw, uint32_t num_images,
                                  void *pixel_values_videos, uint32_t total_patches_videos, uint32_t *video_grid_thw, uint32_t num_videos,
                                  uint32_t patch_features,
                                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                                  struct Qwen3vlCache **kv_caches,
                                  const float *temperature, const uint32_t *topk, const float *topp,
                                  uint32_t *output) {
    model->req.tokens = tokens;
    model->req.ntok = ntok;
    model->req.pixel_values = pixel_values;
    model->req.total_patches = total_patches;
    model->req.image_grid_thw = image_grid_thw;
    model->req.num_images = num_images;
    model->req.pixel_values_videos = pixel_values_videos;
    model->req.total_patches_videos = total_patches_videos;
    model->req.video_grid_thw = video_grid_thw;
    model->req.num_videos = num_videos;
    model->req.patch_features = patch_features;
    model->req.req_lens = req_lens;
    model->req.nreq = nreq;
    model->req.req_pos = req_pos;
    model->req.kv_caches = kv_caches;
    model->req.output = output;
    model->req.logits = nullptr;
    model->req.temperature = temperature;
    model->req.topk = topk;
    model->req.topp = topp;
    model->states[0].sync_cnt = model->dev_ids.size();
    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].proceed = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t i = model->dev_ids.size(); i > 0; i--) {
        auto idev = i - 1;
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
        lock.unlock();
    }
}
__INFINI_C void forwardBatchQwen3vl(struct Qwen3vlModel *model,
                                    const uint32_t *tokens, uint32_t ntok,
                                    void *pixel_values, uint32_t total_patches, uint32_t *image_grid_thw, uint32_t num_images,
                                    void *pixel_values_videos, uint32_t total_patches_videos, uint32_t *video_grid_thw, uint32_t num_videos,
                                    uint32_t patch_features,
                                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                                    struct Qwen3vlCache **kv_caches,
                                    void *logits) {
    model->req.tokens = tokens;
    model->req.ntok = ntok;
    model->req.pixel_values = pixel_values;
    model->req.total_patches = total_patches;
    model->req.image_grid_thw = image_grid_thw;
    model->req.num_images = num_images;
    model->req.pixel_values_videos = pixel_values_videos;
    model->req.total_patches_videos = total_patches_videos;
    model->req.video_grid_thw = video_grid_thw;
    model->req.num_videos = num_videos;
    model->req.patch_features = patch_features;
    model->req.req_lens = req_lens;
    model->req.nreq = nreq;
    model->req.req_pos = req_pos;
    model->req.kv_caches = kv_caches;
    model->req.output = nullptr;
    model->req.logits = logits;
    model->req.temperature = nullptr;
    model->req.topk = nullptr;
    model->req.topp = nullptr;
    model->states[0].sync_cnt = model->dev_ids.size();
    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].proceed = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t i = model->dev_ids.size(); i > 0; i--) {
        auto idev = i - 1;
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
        lock.unlock();
    }
}
void launchDevice(const Qwen3vlMeta &meta, std::shared_ptr<Qwen3vlDeviceWeights> weights, Qwen3vlDeviceResource *rsrc,
                  InferState &state, InferRequest &req,
                  infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
    // Create Device Resource
    createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
    CacheManager cache_manager(100);
    InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
    // Set the inference context for this thread
    setInferenceContext(&ctx);
    {
        std::unique_lock<std::mutex> lock(state.mtx);
        state.loaded = true;
        lock.unlock();
        state.cv_load.notify_one();
    }
    // Infer Loop
    while (true) {
        std::unique_lock<std::mutex> lock(state.mtx);
        state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; });
        // quit if exit_flag is set
        if (state.exit_flag) {
            break;
        }
        inferDeviceBatch(meta, *rsrc, idev, ndev, state, req);
        state.proceed = false;
        lock.unlock();
        state.cv_done.notify_one();
    }
    // Clean-Up
    releaseDeviceResource(*rsrc);
    setInferenceContext(nullptr); // Clear the context when done
}
Qwen3vlModel::Qwen3vlModel(const Qwen3vlMeta *_meta, const Qwen3vlWeights *weights) : meta(*_meta) {
    auto device_weights = weights->device_weights;
    int ndev = device_weights.size();
    device = device_weights[0]->device;
    dev_ids.resize(ndev);
    for (int i = 0; i < ndev; i++) {
        dev_ids[i] = device_weights[i]->dev_id;
    }
    dev_resources = std::vector<Qwen3vlDeviceResource>(ndev);
    states = std::vector<InferState>(ndev);
    threads.resize(ndev);
    RUN_INFINI(infinirtInit());
    auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
    if (ndev > 1) {
        RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
    }
    for (int i = 0; i < ndev; i++) {
        threads[i] = std::thread(launchDevice, std::cref(meta), device_weights[i], &dev_resources[i],
                                 std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]);
    }
    for (int i = 0; i < ndev; i++) {
        std::unique_lock<std::mutex> lock(states[i].mtx);
        states[i].cv_load.wait(lock, [&] { return states[i].loaded; });
        lock.unlock();
    }
}
__INFINI_C struct Qwen3vlModel *createQwen3vlModel(const Qwen3vlMeta *_meta, const Qwen3vlWeights *weights) {
    Qwen3vlModel *model = new Qwen3vlModel(_meta, weights);
    return model;
}
__INFINI_C void destroyQwen3vlModel(struct Qwen3vlModel *model) {
    auto ndev = model->dev_resources.size();
    for (size_t idev = 0; idev < ndev; idev++) {
        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
        model->states[idev].exit_flag = true;
        lock.unlock();
        model->states[idev].cv_start.notify_one();
    }
    for (size_t idev = 0; idev < ndev; idev++) {
        model->threads[idev].join();
    }
    delete model;
}
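For reference, a minimal caller-side sketch of the C API defined above (createQwen3vlModel / inferBatchQwen3vl / destroyQwen3vlModel, plus createQwen3vlCache from qwen3vl_cache.cpp below): a single text-only request decoding one token. Constructing Qwen3vlMeta and Qwen3vlWeights (normally driven from scripts/qwen3vl.py) is omitted, and the helper name decode_one_step is hypothetical, not part of this commit.

// Sketch only: one text-only decode step through the exported C API.
#include "infinicore_infer.h"
#include <vector>

uint32_t decode_one_step(Qwen3vlModel *model, Qwen3vlCache *cache,
                         const std::vector<uint32_t> &prompt) {
    uint32_t ntok = uint32_t(prompt.size());
    uint32_t req_len = ntok, req_pos = 0;   // one request, no cached history yet
    float temperature = 1.0f, topp = 0.9f;
    uint32_t topk = 50, next_token = 0;
    Qwen3vlCache *caches[1] = {cache};
    inferBatchQwen3vl(model, prompt.data(), ntok,
                      /*pixel_values=*/nullptr, /*total_patches=*/0,
                      /*image_grid_thw=*/nullptr, /*num_images=*/0,
                      /*pixel_values_videos=*/nullptr, /*total_patches_videos=*/0,
                      /*video_grid_thw=*/nullptr, /*num_videos=*/0,
                      /*patch_features=*/0,
                      &req_len, /*nreq=*/1, &req_pos, caches,
                      &temperature, &topk, &topp, &next_token);
    return next_token; // sampled on device 0 and copied back by the runtime
}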
src/models/qwen3vl/qwen3vl_cache.cpp
0 → 100644
View file @
cfe4b1a8
#include "qwen3vl_impl.hpp"
__INFINI_C struct Qwen3vlCache *createQwen3vlCache(const struct Qwen3vlModel *model) {
    Qwen3vlCache *cache = new Qwen3vlCache();
    auto ndev = model->dev_resources.size();
    auto nlayer = model->meta.text_meta.num_hidden_layers;
    auto max_len = model->meta.text_meta.max_tokens;
    auto dh = model->meta.text_meta.head_dim;
    auto nkv = model->meta.text_meta.num_key_value_heads / size_t(ndev);
    auto k_rot_shape = std::vector<size_t>{max_len, nkv, dh};
    auto v_shape = std::vector<size_t>{max_len, nkv, dh};
    for (size_t idev = 0; idev < ndev; idev++) {
        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
        auto k_rot_cache = std::vector<std::shared_ptr<Tensor>>();
        auto v_cache = std::vector<std::shared_ptr<Tensor>>();
        for (size_t layer = 0; layer < nlayer; layer++) {
            k_rot_cache.push_back(std::move(Tensor::buffer(model->meta.dtype, k_rot_shape)));
            v_cache.push_back(std::move(Tensor::buffer(model->meta.dtype, v_shape)));
        }
        cache->k_rot.push_back(k_rot_cache);
        cache->v.push_back(v_cache);
    }
    return cache;
}
////// Does the visual deepstack also need a cache?
__INFINI_C void dropQwen3vlCache(const struct Qwen3vlModel *model, struct Qwen3vlCache *cache) {
    auto ndev = model->dev_resources.size();
    auto nlayer = model->meta.text_meta.num_hidden_layers;
    for (size_t idev = 0; idev < ndev; idev++) {
        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
        for (size_t layer = 0; layer < nlayer; layer++) {
            cache->k_rot[idev][layer].reset();
            cache->v[idev][layer].reset();
        }
    }
    delete cache;
}
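createQwen3vlCache() above allocates, per device and per layer, one K (k_rot) and one V tensor of shape {max_tokens, nkvh/ndev, head_dim}. A small sketch of the resulting per-request footprint on one device follows; the helper name kv_cache_bytes_per_device is illustrative only, not part of this commit.

// Sketch only: per-request KV-cache bytes on one device, mirroring the shapes
// allocated in createQwen3vlCache() above.
#include <cstddef>

size_t kv_cache_bytes_per_device(size_t nlayer, size_t max_tokens,
                                 size_t nkvh_total, size_t ndev,
                                 size_t head_dim, size_t dtype_size) {
    size_t per_layer = max_tokens * (nkvh_total / ndev) * head_dim * dtype_size;
    return nlayer * 2 * per_layer; // 2 = K (k_rot) + V
}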
src/models/qwen3vl/qwen3vl_impl.hpp
0 → 100644
View file @
cfe4b1a8
#ifndef QWEN3VL_IMPL_H
#define QWEN3VL_IMPL_H
#include "infinicore_infer.h"
#include "../../allocator.hpp"
#include "../../tensor.hpp"
#include <condition_variable>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
struct Qwen3vlLayerWeight {
    std::shared_ptr<Tensor> attn_norm;
    std::shared_ptr<Tensor> attn_qkv_proj;
    std::shared_ptr<Tensor> attn_q_norm;
    std::shared_ptr<Tensor> attn_k_norm;
    std::shared_ptr<Tensor> attn_o_proj;
    std::shared_ptr<Tensor> mlp_norm;
    std::shared_ptr<Tensor> mlp_gate_up;
    std::shared_ptr<Tensor> mlp_down;
};
struct Qwen3vlLanguageModelWeight {
    std::shared_ptr<Tensor> in_embd, out_embd, out_norm;
    std::vector<Qwen3vlLayerWeight> layers;
};
struct Qwen3vlVisBlockWeight {
    std::shared_ptr<Tensor> attn_proj_weight, attn_proj_bias, attn_qkv_weight, attn_qkv_bias;
    std::shared_ptr<Tensor> mlp_linear_fc1_weight, mlp_linear_fc1_bias, mlp_linear_fc2_weight, mlp_linear_fc2_bias;
    std::shared_ptr<Tensor> norm1_weight, norm1_bias, norm2_weight, norm2_bias;
};
struct DeepstackMergerWeight {
    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
    std::shared_ptr<Tensor> norm_weight, norm_bias;
};
struct MergerWeight {
    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
    std::shared_ptr<Tensor> norm_weight, norm_bias;
};
struct Qwen3vlVisualEncoderWeight {
    std::shared_ptr<Tensor> patch_embed_weight, patch_embed_bias, pos_embed_weight;
    std::vector<Qwen3vlVisBlockWeight> blocks;
    std::vector<DeepstackMergerWeight> deepstack_mergers;
    std::shared_ptr<MergerWeight> merger;
};
struct Qwen3vlDeviceWeights {
    std::shared_ptr<Tensor> sin_table, cos_table;
    std::shared_ptr<Qwen3vlLanguageModelWeight> w_lang;
    std::shared_ptr<Qwen3vlVisualEncoderWeight> w_vis;
    infiniDevice_t device;
    int dev_id;
    infinirtStream_t load_stream;
};
struct Qwen3vlWeights {
    Qwen3vlMeta const *meta;
    bool transpose_weight;
    std::vector<std::shared_ptr<Qwen3vlDeviceWeights>> device_weights;
    Qwen3vlWeights(const Qwen3vlMeta *meta, infiniDevice_t device, int ndev, const int *dev_ids, bool transpose_weight);
};
struct Qwen3vlDeviceResource {
    // Device
    infiniDevice_t device;
    int device_id;
    infiniopHandle_t handle;
    // Weights
    std::shared_ptr<Qwen3vlDeviceWeights> weights;
    // Streams
    infinirtStream_t stream;
    // Communicator
    infinicclComm_t comm;
    std::shared_ptr<MemoryPool> memory_pool;
};
struct InferState {
    // qwen3vl namespace
    inline static std::mutex mtx_sync;
    inline static int sync_cnt;
    inline static std::condition_variable cv_sync;
    std::mutex mtx;
    std::condition_variable cv_load, cv_start, cv_done;
    bool loaded = false;
    bool proceed = false;
    bool exit_flag = false;
};
struct InferRequest {
    // qwen3vl namespace
    const uint32_t *tokens;
    uint32_t ntok;
    void *pixel_values;
    uint32_t total_patches;
    uint32_t *image_grid_thw;
    uint32_t num_images;
    void *pixel_values_videos;
    uint32_t total_patches_videos;
    uint32_t *video_grid_thw;
    uint32_t num_videos;
    uint32_t patch_features;
    const uint32_t *req_lens;
    uint32_t nreq;
    const uint32_t *req_pos;
    struct Qwen3vlCache **kv_caches;
    const float *temperature;
    const uint32_t *topk;
    const float *topp;
    uint32_t *output;
    void *logits;
};
struct Qwen3vlModel {
    Qwen3vlMeta meta;
    infiniDevice_t device;
    std::vector<int> dev_ids;
    std::vector<Qwen3vlDeviceResource> dev_resources;
    std::vector<InferState> states;
    std::vector<std::thread> threads;
    InferRequest req;
    Qwen3vlModel(const Qwen3vlMeta *, const Qwen3vlWeights *weights);
};
struct Qwen3vlCache {
    std::vector<std::vector<std::shared_ptr<Tensor>>> k_rot, v;
};
#endif
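The InferState members declared above carry a simple start/done handshake between the host thread (inferBatchQwen3vl / forwardBatchQwen3vl) and each device worker (launchDevice). A condensed sketch of that pattern with the model-specific pieces stripped out; the Handshake/worker/submit names are illustrative only, not part of this commit.

// Sketch only: the proceed / cv_start / cv_done handshake implied by InferState,
// reduced to one host thread driving one worker thread.
#include <condition_variable>
#include <mutex>

struct Handshake {
    std::mutex mtx;
    std::condition_variable cv_start, cv_done;
    bool proceed = false, exit_flag = false;
};

void worker(Handshake &s) {
    while (true) {
        std::unique_lock<std::mutex> lock(s.mtx);
        s.cv_start.wait(lock, [&] { return s.proceed || s.exit_flag; });
        if (s.exit_flag) break;
        // ... run one batch (inferDeviceBatch in the real code) ...
        s.proceed = false;
        lock.unlock();
        s.cv_done.notify_one();
    }
}

void submit(Handshake &s) {
    { std::lock_guard<std::mutex> lock(s.mtx); s.proceed = true; }
    s.cv_start.notify_one();
    std::unique_lock<std::mutex> lock(s.mtx);
    s.cv_done.wait(lock, [&] { return !s.proceed; });
}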
src/models/qwen3vl/qwen3vl_weight.cpp
0 → 100644
View file @
cfe4b1a8
#include "qwen3vl_impl.hpp"
#include <cmath>
inline std::shared_ptr<Tensor> getInEmbd(const Qwen3vlMeta *meta) {
    auto shape = std::vector<size_t>({meta->text_meta.vocab_size, meta->text_meta.hidden_size});
    return Tensor::weight(nullptr, meta->dtype, shape);
}
inline std::shared_ptr<Tensor> getOutNorm(const Qwen3vlMeta *meta) {
    auto shape = std::vector<size_t>({meta->text_meta.hidden_size});
    return Tensor::weight(nullptr, meta->dtype, shape);
}
inline std::shared_ptr<Tensor> getOutEmbd(const Qwen3vlMeta *meta) {
    auto shape = std::vector<size_t>({meta->text_meta.vocab_size, meta->text_meta.hidden_size});
    return Tensor::weight(nullptr, meta->dtype, shape)->permute({1, 0});
}
inline void getLayerWeight(const Qwen3vlMeta *meta, Qwen3vlLayerWeight &layer, int ndev) {
    auto nkvh = meta->text_meta.num_key_value_heads;
    auto nh = meta->text_meta.num_attention_heads;
    auto dh = meta->text_meta.head_dim;
    auto d = meta->text_meta.hidden_size;
    auto di = meta->text_meta.intermediate_size;
    auto dh_shape = std::vector<size_t>({meta->text_meta.hidden_size});
    layer.attn_norm = Tensor::weight(nullptr, meta->dtype, dh_shape);
    auto qk_norm_shape = std::vector<size_t>({meta->text_meta.head_dim});
    layer.attn_q_norm = Tensor::weight(nullptr, meta->dtype, qk_norm_shape);
    layer.attn_k_norm = Tensor::weight(nullptr, meta->dtype, qk_norm_shape);
    auto qkv_proj_shape = std::vector<size_t>({(nh + 2 * nkvh) / ndev * dh, d});
    layer.attn_qkv_proj = Tensor::weight(nullptr, meta->dtype, qkv_proj_shape);
    auto o_proj_shape = std::vector<size_t>({d, nh / ndev * dh});
    layer.attn_o_proj = Tensor::weight(nullptr, meta->dtype, o_proj_shape);
    layer.mlp_norm = Tensor::weight(nullptr, meta->dtype, dh_shape);
    auto up_shape = std::vector<size_t>({2 * di / ndev, d});
    layer.mlp_gate_up = Tensor::weight(nullptr, meta->dtype, up_shape);
    auto down_shape = std::vector<size_t>({d, di / ndev});
    layer.mlp_down = Tensor::weight(nullptr, meta->dtype, down_shape);
}
inline void getVisualWeight(const Qwen3vlMeta *meta, std::shared_ptr<Qwen3vlVisualEncoderWeight> w_vis) {
    Qwen3vlVisMeta vis_meta = meta->vis_meta;
    auto patch_embed_shape = std::vector<size_t>({vis_meta.hidden_size, vis_meta.in_channels, vis_meta.temporal_patch_size, vis_meta.patch_size, vis_meta.patch_size});
    w_vis->patch_embed_weight = Tensor::weight(nullptr, meta->dtype, patch_embed_shape);
    w_vis->patch_embed_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    w_vis->pos_embed_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.num_position_embeddings, vis_meta.hidden_size});
    w_vis->merger = std::make_shared<MergerWeight>();
    w_vis->merger->linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.intermediate_size});
    w_vis->merger->linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size, vis_meta.intermediate_size});
    w_vis->merger->linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
    w_vis->merger->linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size});
    w_vis->merger->norm_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    w_vis->merger->norm_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    w_vis->blocks = std::vector<Qwen3vlVisBlockWeight>(vis_meta.depth);
    for (size_t i = 0; i < vis_meta.depth; i++) {
        w_vis->blocks[i].attn_proj_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size, vis_meta.hidden_size});
        w_vis->blocks[i].attn_proj_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        w_vis->blocks[i].attn_qkv_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.in_channels * vis_meta.hidden_size, vis_meta.hidden_size});
        w_vis->blocks[i].attn_qkv_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.in_channels * vis_meta.hidden_size});
        w_vis->blocks[i].mlp_linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.hidden_size});
        w_vis->blocks[i].mlp_linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
        w_vis->blocks[i].mlp_linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size, vis_meta.intermediate_size});
        w_vis->blocks[i].mlp_linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        w_vis->blocks[i].norm1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        w_vis->blocks[i].norm1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        w_vis->blocks[i].norm2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        w_vis->blocks[i].norm2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    }
    w_vis->deepstack_mergers = std::vector<DeepstackMergerWeight>(3);
    for (size_t i = 0; i < 3; i++) {
        w_vis->deepstack_mergers[i].linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.intermediate_size});
        w_vis->deepstack_mergers[i].linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size, vis_meta.intermediate_size});
        w_vis->deepstack_mergers[i].linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
        w_vis->deepstack_mergers[i].linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size});
        w_vis->deepstack_mergers[i].norm_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
        w_vis->deepstack_mergers[i].norm_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
    }
}
inline std::shared_ptr<Tensor> getSinTable(const Qwen3vlMeta *meta) {
    auto half_dh = meta->text_meta.head_dim / 2;
    auto unit = dsize(meta->dtype);
    void *table = std::malloc(meta->text_meta.max_tokens * half_dh * unit);
    for (size_t i = 0; i < meta->text_meta.max_tokens; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _sin = std::sin(static_cast<float>(i) / std::pow(meta->text_meta.rope_theta, static_cast<float>(j) / half_dh));
            if (meta->dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin);
            } else if (meta->dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin);
            } else if (meta->dtype == INFINI_DTYPE_F32) {
                ((float *)table)[i * half_dh + j] = _sin;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({meta->text_meta.max_tokens, half_dh});
    auto tensor = Tensor::weight(table, meta->dtype, shape);
    std::free(table);
    return tensor;
}
inline std::shared_ptr<Tensor> getCosTable(const Qwen3vlMeta *meta) {
    auto half_dh = meta->text_meta.head_dim / 2;
    auto unit = dsize(meta->dtype);
    void *table = std::malloc(meta->text_meta.max_tokens * half_dh * unit);
    for (size_t i = 0; i < meta->text_meta.max_tokens; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _cos = std::cos(static_cast<float>(i) / std::pow(meta->text_meta.rope_theta, static_cast<float>(j) / half_dh));
            if (meta->dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos);
            } else if (meta->dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos);
            } else if (meta->dtype == INFINI_DTYPE_F32) {
                ((float *)table)[i * half_dh + j] = _cos;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({meta->text_meta.max_tokens, half_dh});
    auto tensor = Tensor::weight(table, meta->dtype, shape);
    std::free(table);
    return tensor;
}
Qwen3vlWeights::Qwen3vlWeights(const Qwen3vlMeta *_meta, infiniDevice_t device, int ndev, const int *dev_ids, bool _transpose_weight) {
    meta = _meta;
    transpose_weight = _transpose_weight;
    device_weights = std::vector<std::shared_ptr<Qwen3vlDeviceWeights>>(ndev);
    for (int dev = 0; dev < ndev; dev++) {
        int dev_id = dev_ids[dev];
        RUN_INFINI(infinirtSetDevice(device, dev_id));
        device_weights[dev] = std::make_shared<Qwen3vlDeviceWeights>();
        device_weights[dev]->device = device;
        device_weights[dev]->dev_id = dev_id;
        RUN_INFINI(infinirtStreamCreate(&device_weights[dev]->load_stream));
        device_weights[dev]->w_lang = std::make_shared<Qwen3vlLanguageModelWeight>();
        device_weights[dev]->w_vis = std::make_shared<Qwen3vlVisualEncoderWeight>();
        device_weights[dev]->w_lang->in_embd = getInEmbd(meta);
        device_weights[dev]->w_lang->out_norm = getOutNorm(meta);
        device_weights[dev]->w_lang->out_embd = getOutEmbd(meta);
        device_weights[dev]->sin_table = getSinTable(meta);
        device_weights[dev]->cos_table = getCosTable(meta);
        device_weights[dev]->w_lang->layers = std::vector<Qwen3vlLayerWeight>(meta->text_meta.num_hidden_layers);
        for (size_t layer = 0; layer < meta->text_meta.num_hidden_layers; layer++) {
            getLayerWeight(meta, device_weights[dev]->w_lang->layers[layer], ndev);
        }
        getVisualWeight(meta, device_weights[dev]->w_vis);
    }
}
//--- Lang Global
void
load_input_embd
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
)
{
std
::
cout
<<
"Loading input embedding from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
in_embd
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_output_norm
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
)
{
std
::
cout
<<
"Loading output norm from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
out_norm
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_output_embd
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
)
{
std
::
cout
<<
"Loading output embedding from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
out_embd
->
load
(
cpu_ptr
,
weight
->
load_stream
);
if
(
weights
->
transpose_weight
)
{
weight
->
w_lang
->
out_embd
->
permute
({
1
,
0
});
//[d,voc]
}
}
}
// --- Attention
void
load_attn_norm
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading attention norm "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
attn_norm
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_attn_q_norm
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading attention q_norm "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
attn_q_norm
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_attn_qkv_proj
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading attention q_proj "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
int
ndev
=
int
(
weights
->
device_weights
.
size
());
auto
nkvh
=
weights
->
meta
->
text_meta
.
num_key_value_heads
;
auto
nh
=
weights
->
meta
->
text_meta
.
num_attention_heads
;
auto
dh
=
weights
->
meta
->
text_meta
.
head_dim
;
auto
d
=
weights
->
meta
->
text_meta
.
hidden_size
;
//[ndev,nh+2*nkvh,dh,d]
for
(
int
idev
=
0
;
idev
<
ndev
;
idev
++
)
{
auto
weight
=
weights
->
device_weights
[
idev
];
size_t
offset
=
idev
*
((
nkvh
*
2
+
nh
)
/
ndev
*
dh
)
*
d
*
dsize
(
weights
->
meta
->
dtype
);
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
attn_qkv_proj
->
load
((
char
*
)
cpu_ptr
+
offset
,
weight
->
load_stream
);
if
(
weights
->
transpose_weight
)
{
weight
->
w_lang
->
layers
[
layer
].
attn_qkv_proj
=
weight
->
w_lang
->
layers
[
layer
].
attn_qkv_proj
->
permute
({
1
,
0
});
//[d, (nh+2*nkvh)*dh]
}
}
}
void
load_attn_k_norm
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading attention k_norm "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
attn_k_norm
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_attn_o_proj
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading attention o_proj "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
int
ndev
=
int
(
weights
->
device_weights
.
size
());
auto
nh
=
weights
->
meta
->
text_meta
.
num_attention_heads
;
auto
dh
=
weights
->
meta
->
text_meta
.
head_dim
;
auto
d
=
weights
->
meta
->
text_meta
.
hidden_size
;
// [ndev, d, nh // ndev * dh]
for
(
int
idev
=
0
;
idev
<
ndev
;
idev
++
)
{
auto
weight
=
weights
->
device_weights
[
idev
];
size_t
offset
=
idev
*
d
*
(
nh
/
ndev
*
dh
)
*
dsize
(
weights
->
meta
->
dtype
);
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
attn_o_proj
->
load
((
char
*
)
cpu_ptr
+
offset
,
weight
->
load_stream
);
if
(
weights
->
transpose_weight
)
{
weight
->
w_lang
->
layers
[
layer
].
attn_o_proj
=
weight
->
w_lang
->
layers
[
layer
].
attn_o_proj
->
permute
({
1
,
0
});
//[nh/ndev*dh, d]
}
}
}
// --- MLP
void
load_mlp_norm
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading mlp norm "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
mlp_norm
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_mlp_gate_up
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading mlp gate "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
int
ndev
=
int
(
weights
->
device_weights
.
size
());
auto
di
=
weights
->
meta
->
text_meta
.
intermediate_size
;
auto
d
=
weights
->
meta
->
text_meta
.
hidden_size
;
// [ndev, 2*di // ndev, d]
for
(
int
idev
=
0
;
idev
<
ndev
;
idev
++
)
{
auto
weight
=
weights
->
device_weights
[
idev
];
size_t
offset
=
idev
*
(
2
*
di
/
ndev
)
*
d
*
dsize
(
weights
->
meta
->
dtype
);
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
mlp_gate_up
->
load
((
char
*
)
cpu_ptr
+
offset
,
weight
->
load_stream
);
if
(
weights
->
transpose_weight
)
{
weight
->
w_lang
->
layers
[
layer
].
mlp_gate_up
=
weight
->
w_lang
->
layers
[
layer
].
mlp_gate_up
->
permute
({
1
,
0
});
//[d, 2*di/ndev]
}
}
}
void
load_mlp_down
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading mlp down "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
int
ndev
=
int
(
weights
->
device_weights
.
size
());
auto
di
=
weights
->
meta
->
text_meta
.
intermediate_size
;
auto
d
=
weights
->
meta
->
text_meta
.
hidden_size
;
//[ndev, d, di // ndev]
for
(
int
idev
=
0
;
idev
<
ndev
;
idev
++
)
{
auto
weight
=
weights
->
device_weights
[
idev
];
size_t
offset
=
idev
*
d
*
(
di
/
ndev
)
*
dsize
(
weights
->
meta
->
dtype
);
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_lang
->
layers
[
layer
].
mlp_down
->
load
((
char
*
)
cpu_ptr
+
offset
,
weight
->
load_stream
);
if
(
weights
->
transpose_weight
)
{
weight
->
w_lang
->
layers
[
layer
].
mlp_down
=
weight
->
w_lang
->
layers
[
layer
].
mlp_down
->
permute
({
1
,
0
});
//[di/ndev, d]
}
}
}
// --- Vision weights
void
load_patch_embed_weight
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
)
{
std
::
cout
<<
"Loading patch embed weight from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
patch_embed_weight
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_patch_embed_bias
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
)
{
std
::
cout
<<
"Loading patch embed bias from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
patch_embed_bias
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_pos_embed_weight
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
)
{
std
::
cout
<<
"Loading pos embed weight from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
pos_embed_weight
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
// Vision block attention
void
load_attn_proj_weight
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision attn proj weight "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
attn_proj_weight
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_attn_proj_bias
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision attn proj bias "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
attn_proj_bias
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_attn_qkv_weight
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision attn qkv weight "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
attn_qkv_weight
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_attn_qkv_bias
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision attn qkv bias "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
attn_qkv_bias
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
// Vision block mlp
void
load_mlp_linear_fc1_weight
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision mlp fc1 weight "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
mlp_linear_fc1_weight
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_mlp_linear_fc1_bias
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision mlp fc1 bias "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
mlp_linear_fc1_bias
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_mlp_linear_fc2_weight
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision mlp fc2 weight "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
mlp_linear_fc2_weight
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
void
load_mlp_linear_fc2_bias
(
Qwen3vlWeights
*
weights
,
void
*
cpu_ptr
,
size_t
layer
)
{
std
::
cout
<<
"Loading vision mlp fc2 bias "
<<
layer
<<
" from "
<<
cpu_ptr
<<
std
::
endl
;
for
(
int
dev
=
0
;
dev
<
int
(
weights
->
device_weights
.
size
());
dev
++
)
{
auto
weight
=
weights
->
device_weights
[
dev
];
RUN_INFINI
(
infinirtSetDevice
(
weight
->
device
,
weight
->
dev_id
));
weight
->
w_vis
->
blocks
[
layer
].
mlp_linear_fc2_bias
->
load
(
cpu_ptr
,
weight
->
load_stream
);
}
}
// Vision block norm
void load_norm1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm1 weight " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm1_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_norm1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm1 bias " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm1_bias->load(cpu_ptr, weight->load_stream);
    }
}
void load_norm2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm2 weight " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm2_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_norm2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm2 bias " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm2_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Deepstack merger
void load_deepstack_merger_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc1 weight " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc1_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_deepstack_merger_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc1 bias " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc1_bias->load(cpu_ptr, weight->load_stream);
    }
}
void load_deepstack_merger_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc2 weight " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc2_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_deepstack_merger_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc2 bias " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc2_bias->load(cpu_ptr, weight->load_stream);
    }
}
void load_deepstack_merger_norm_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger norm weight " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].norm_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_deepstack_merger_norm_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger norm bias " << layer << " from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].norm_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Merger
void load_merger_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc1 weight from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc1_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_merger_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc1 bias from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc1_bias->load(cpu_ptr, weight->load_stream);
    }
}
void load_merger_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc2 weight from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc2_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_merger_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc2 bias from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc2_bias->load(cpu_ptr, weight->load_stream);
    }
}
void load_merger_norm_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger norm weight from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->norm_weight->load(cpu_ptr, weight->load_stream);
    }
}
void load_merger_norm_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger norm bias from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->norm_bias->load(cpu_ptr, weight->load_stream);
    }
}
static Qwen3vlWeightLoader weight_loader = {
    // Language model loaders
    .lang_loader = {
        .load_input_embd = load_input_embd,
        .load_output_norm = load_output_norm,
        .load_output_embd = load_output_embd,
        .load_attn_norm = load_attn_norm,
        .load_attn_q_norm = load_attn_q_norm,
        .load_attn_k_norm = load_attn_k_norm,
        .load_attn_qkv_proj = load_attn_qkv_proj,
        .load_attn_o_proj = load_attn_o_proj,
        .load_mlp_norm = load_mlp_norm,
        .load_mlp_gate_up = load_mlp_gate_up,
        .load_mlp_down = load_mlp_down,
    },
    // Vision model loaders
    .vis_loader = {
        .load_patch_embed_weight = load_patch_embed_weight,
        .load_patch_embed_bias = load_patch_embed_bias,
        .load_pos_embed_weight = load_pos_embed_weight,
        .load_attn_proj_weight = load_attn_proj_weight,
        .load_attn_proj_bias = load_attn_proj_bias,
        .load_attn_qkv_weight = load_attn_qkv_weight,
        .load_attn_qkv_bias = load_attn_qkv_bias,
        .load_mlp_linear_fc1_weight = load_mlp_linear_fc1_weight,
        .load_mlp_linear_fc1_bias = load_mlp_linear_fc1_bias,
        .load_mlp_linear_fc2_weight = load_mlp_linear_fc2_weight,
        .load_mlp_linear_fc2_bias = load_mlp_linear_fc2_bias,
        .load_norm1_weight = load_norm1_weight,
        .load_norm1_bias = load_norm1_bias,
        .load_norm2_weight = load_norm2_weight,
        .load_norm2_bias = load_norm2_bias,
        .load_deepstack_merger_linear_fc1_weight = load_deepstack_merger_linear_fc1_weight,
        .load_deepstack_merger_linear_fc1_bias = load_deepstack_merger_linear_fc1_bias,
        .load_deepstack_merger_linear_fc2_weight = load_deepstack_merger_linear_fc2_weight,
        .load_deepstack_merger_linear_fc2_bias = load_deepstack_merger_linear_fc2_bias,
        .load_deepstack_merger_norm_weight = load_deepstack_merger_norm_weight,
        .load_deepstack_merger_norm_bias = load_deepstack_merger_norm_bias,
        .load_merger_linear_fc1_weight = load_merger_linear_fc1_weight,
        .load_merger_linear_fc1_bias = load_merger_linear_fc1_bias,
        .load_merger_linear_fc2_weight = load_merger_linear_fc2_weight,
        .load_merger_linear_fc2_bias = load_merger_linear_fc2_bias,
        .load_merger_norm_weight = load_merger_norm_weight,
        .load_merger_norm_bias = load_merger_norm_bias,
    }};
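The table above binds every loader defined in this file into one static Qwen3vlWeightLoader. When extending it, keep in mind a general C++ rule (not something specific to this patch): C++20 designated initializers must list members in declaration order, so a new function-pointer field has to be added to the struct and to this initializer at the same position. A minimal generic illustration:

struct Example { int first; int second; };
Example ok = {.first = 1, .second = 2};    // members in declaration order: fine
// Example bad = {.second = 2, .first = 1}; // ill-formed in standard C++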
__INFINI_C Qwen3vlWeights *
createQwen3vlWeights(const Qwen3vlMeta *meta,
                     infiniDevice_t device,
                     int ndev,
                     const int *dev_ids,
                     bool transpose_weight) {
    printf("=== C++ createQwen3vlWeights ===\n");
    printf("sizeof(Qwen3vlTextMeta): %zu\n", sizeof(Qwen3vlTextMeta));
    printf("sizeof(Qwen3vlVisMeta): %zu\n", sizeof(Qwen3vlVisMeta));
    printf("sizeof(Qwen3vlMeta): %zu\n", sizeof(Qwen3vlMeta));
    printf("meta->dtype: %d\n", meta->dtype);
    printf("meta->text_meta.hidden_size: %zu\n", meta->text_meta.hidden_size);
    printf("meta->text_meta.num_hidden_layers: %zu\n", meta->text_meta.num_hidden_layers);
    printf("meta->text_meta.vocab_size: %zu\n", meta->text_meta.vocab_size);
    printf("meta->vis_meta.depth: %zu\n", meta->vis_meta.depth);
    printf("device: %d, ndev: %d, dev_ids[0]: %d\n", device, ndev, dev_ids[0]);
    fflush(stdout);
    auto weights = new Qwen3vlWeights(meta, device, ndev, dev_ids, transpose_weight);
    return weights;
};
__INFINI_C Qwen3vlWeightLoader *
createQwen3vlWeightLoader() {
    return &weight_loader;
}
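For context, a host application would typically obtain the weight container and the loader table through the two exported functions above and then push host buffers tensor by tensor. The sketch below shows that flow only as an illustration; the meta, device, dev_ids, and host-buffer variables are placeholders, not symbols from this patch.

// Illustrative driver sketch; all inputs below are placeholders.
Qwen3vlMeta meta = /* filled from the model config */;
infiniDevice_t device = /* chosen at runtime */;
int dev_ids[] = {0};
Qwen3vlWeights *weights =
    createQwen3vlWeights(&meta, device, /*ndev=*/1, dev_ids, /*transpose_weight=*/true);
Qwen3vlWeightLoader *loader = createQwen3vlWeightLoader();

// Global tensors take (weights, host_ptr); per-layer tensors also take an index.
loader->lang_loader.load_input_embd(weights, input_embd_host);
for (size_t blk = 0; blk < meta.vis_meta.depth; ++blk) {
    loader->vis_loader.load_norm1_weight(weights, norm1_host[blk], blk);
}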
src/tensor/tensor.cpp
View file @ cfe4b1a8
...
...
@@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape,
         std::cout << std::endl;
     } else if (dim < shape.size() - 1) {
         for (size_t i = 0; i < shape[dim]; i++) {
-            print_data(data + i * strides[dim], shape, strides, dim + 1);
+            print_data_bf16(data + i * strides[dim], shape, strides, dim + 1);
         }
     }
 }
...
...
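The one-line fix above matters because print_data_bf16 recurses over the outer dimensions: before the change, inner dimensions were handed to the generic print_data, which prints the raw uint16_t bit patterns instead of decoded bfloat16 values. For reference, a typical bfloat16-to-float widening looks like the generic sketch below (not code from this repository; the helper name is arbitrary):

#include <cstdint>
#include <cstring>

// bfloat16 stores the top 16 bits of an IEEE-754 float32, so widening is a shift.
static float bf16_to_float(uint16_t h) {
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}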