Unverified Commit cfe4b1a8 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #267 from InfiniTensor/issue/263_T2-1-4

【比赛2025秋】T2-1-4 qwen3vl
parents 66bfd282 b1f6af34
...@@ -4,7 +4,11 @@ ...@@ -4,7 +4,11 @@
#include "infinicore_infer/cache.h" #include "infinicore_infer/cache.h"
#include "infinicore_infer/weights_loader.h" #include "infinicore_infer/weights_loader.h"
#include "infinicore_infer/models/deepseek.h" #include "infinicore_infer/models/deepseek.h"
#include "infinicore_infer/models/jiuge.h" #include "infinicore_infer/models/jiuge.h"
#include "infinicore_infer/models/jiuge_awq.h"
#include "infinicore_infer/models/qwen3vl.h"
#endif /* INFINICORE_INFER_H */ #endif /* INFINICORE_INFER_H */
#ifndef QWEN3VL_WEIGHTS_H
#define QWEN3VL_WEIGHTS_H
#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdbool.h> /* `bool` in createQwen3vlWeights when compiled as C */
#include <stddef.h>
#include <stdint.h>
// Opaque handle to the full (language + vision) weight set; defined in the
// C++ implementation.
struct Qwen3vlWeights;

// Function pointer signatures for the weight-loading callback tables.
// `cpu_ptr` points to host memory holding the tensor data to upload;
// per-layer callbacks also receive the 0-based layer index.
// NOTE: spelled with the `struct` tag so the typedefs remain valid when this
// header is consumed from plain C (the bare `Qwen3vlWeights` name only
// exists in C++).
typedef void (*qwen3vl_load_global_fn)(struct Qwen3vlWeights *, void *cpu_ptr);
typedef void (*qwen3vl_load_layer_fn)(struct Qwen3vlWeights *, void *cpu_ptr, size_t layer_id);
// Struct containing all weight loading functions
// Callback table for loading the language-model weights.
// Field order is ABI — it must match Qwen3vlLangWeightLoaderCStruct in the
// Python binding exactly.
typedef struct {
    // Global (whole-model) tensors
    qwen3vl_load_global_fn load_input_embd;
    qwen3vl_load_global_fn load_output_norm;
    qwen3vl_load_global_fn load_output_embd;
    // Attention (per layer)
    qwen3vl_load_layer_fn load_attn_norm;
    qwen3vl_load_layer_fn load_attn_q_norm;
    qwen3vl_load_layer_fn load_attn_k_norm;
    qwen3vl_load_layer_fn load_attn_qkv_proj; // fused Q/K/V projection
    qwen3vl_load_layer_fn load_attn_o_proj;
    // MLP (per layer)
    qwen3vl_load_layer_fn load_mlp_norm;
    qwen3vl_load_layer_fn load_mlp_gate_up; // fused gate + up projection
    qwen3vl_load_layer_fn load_mlp_down;
} Qwen3vlLangWeightLoader;
// Callback table for loading the vision-encoder weights.
// Field order is ABI — it must match Qwen3vlVisWeightLoaderCStruct in the
// Python binding exactly. `load_layer_fn` entries are per vision block
// (or per deepstack merger); `load_global_fn` entries are loaded once.
typedef struct {
    // Patch_embed
    qwen3vl_load_global_fn load_patch_embed_weight;
    qwen3vl_load_global_fn load_patch_embed_bias;
    qwen3vl_load_global_fn load_pos_embed_weight;
    // blocks attn
    qwen3vl_load_layer_fn load_attn_proj_weight;
    qwen3vl_load_layer_fn load_attn_proj_bias;
    qwen3vl_load_layer_fn load_attn_qkv_weight;
    qwen3vl_load_layer_fn load_attn_qkv_bias;
    // block mlp
    qwen3vl_load_layer_fn load_mlp_linear_fc1_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc1_bias;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_bias;
    // block norm
    qwen3vl_load_layer_fn load_norm1_weight;
    qwen3vl_load_layer_fn load_norm1_bias;
    qwen3vl_load_layer_fn load_norm2_weight;
    qwen3vl_load_layer_fn load_norm2_bias;
    // deepstack_merger (layer index = merger index, not block index)
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_bias;
    // merger (final, single instance)
    qwen3vl_load_global_fn load_merger_linear_fc1_weight;
    qwen3vl_load_global_fn load_merger_linear_fc1_bias;
    qwen3vl_load_global_fn load_merger_linear_fc2_weight;
    qwen3vl_load_global_fn load_merger_linear_fc2_bias;
    qwen3vl_load_global_fn load_merger_norm_weight;
    qwen3vl_load_global_fn load_merger_norm_bias;
} Qwen3vlVisWeightLoader;
// Top-level loader: the language-model and vision-encoder callback tables,
// in that order (mirrored by Qwen3vlWeightLoaderCStruct in Python).
typedef struct {
    Qwen3vlLangWeightLoader lang_loader;
    Qwen3vlVisWeightLoader vis_loader;
} Qwen3vlWeightLoader;
struct Qwen3vlModel;
// Hyper-parameters of the text (language) model.
// Field order is ABI: the Python mirror (TextMetaCStruct) inserts explicit
// `_pad` fields after each lone 4-byte float to reproduce the compiler's
// padding before the following 8-byte size_t — keep the layouts in sync.
typedef struct {
    size_t bos_token_id;
    size_t eos_token_id;
    size_t head_dim;
    size_t hidden_size;
    float initializer_range;
    size_t intermediate_size;
    size_t max_tokens; // KV-cache capacity (first dim of cache tensors)
    size_t num_attention_heads;
    size_t num_hidden_layers;
    size_t num_key_value_heads;
    float rms_norm_eps;
    size_t mrope_section[3]; // multimodal-RoPE section sizes — presumably (t, h, w); confirm
    size_t rope_theta;       // NOTE(review): integral base — confirm a fractional theta is never needed
    size_t vocab_size;
} Qwen3vlTextMeta;
// Hyper-parameters of the vision encoder.
// Field order is ABI — must match VisMetaCStruct (which adds an explicit
// `_pad` after the lone float) in the Python binding.
typedef struct {
    size_t depth; // number of vision transformer blocks
    size_t deepstack_visual_indexes[3]; // block indexes feeding the deepstack mergers
    size_t hidden_size;
    size_t in_channels;
    float initializer_range;
    size_t intermediate_size;
    size_t num_heads;
    size_t num_position_embeddings;
    size_t out_hidden_size;
    size_t patch_size;
    size_t spatial_merge_size;
    size_t temporal_patch_size;
} Qwen3vlVisMeta;
// Combined model metadata: dtype, text/vision hyper-parameters and the
// special multimodal token ids. Layout mirrored by Qwen3vlMetaCStruct,
// which pads `dtype` up to 8 bytes before `text_meta`.
typedef struct {
    infiniDtype_t dtype; // INFINI_DTYPE_BF16
    Qwen3vlTextMeta text_meta;
    Qwen3vlVisMeta vis_meta;
    size_t image_token_id;
    size_t video_token_id;
    size_t vision_end_token_id;
    size_t vision_start_token_id;
} Qwen3vlMeta;
//////////////////// APIs ///////////////////////
/// @brief 创建模型
/// @param device 协处理器种类
/// @param ndev 协处理器数量
/// @param dev_ids 协处理器编号,长度为 ndev
__INFINI_C __export struct Qwen3vlModel *
createQwen3vlModel(const Qwen3vlMeta *,
const Qwen3vlWeights *);
__INFINI_C Qwen3vlWeights *
createQwen3vlWeights(const Qwen3vlMeta *meta,
infiniDevice_t device,
int ndev,
const int *dev_ids,
bool transpose_weight);
__INFINI_C __export Qwen3vlWeightLoader *
createQwen3vlWeightLoader();
/// @brief 销毁模型
__INFINI_C __export void destroyQwen3vlModel(struct Qwen3vlModel *);
__INFINI_C __export struct Qwen3vlCache *
createQwen3vlCache(const struct Qwen3vlModel *);
__INFINI_C __export void
dropQwen3vlCache(const struct Qwen3vlModel *,
struct Qwen3vlCache *);
/// @brief 批次推理一轮,并采样出新的 token
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param temperature 采样温度(0. 表示贪心采样)
/// @param topk 采样 topk(1 表示贪心采样)
/// @param topp 采样 topp
/// @param output 输出 token 数组,每个请求一个输出,长度至少为nreq
__INFINI_C __export void
inferBatchQwen3vl(struct Qwen3vlModel *,
const uint32_t *tokens, uint32_t ntok,
void *pixel_values, uint32_t total_patches,
uint32_t *image_grid_thw, uint32_t num_images,
void *pixel_values_videos, uint32_t total_patches_videos,
uint32_t *video_grid_thw, uint32_t num_videos,
uint32_t patch_features,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct Qwen3vlCache **caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output);
/// @brief 批次推理一轮,输出 output embedding 后的 logits
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param logits 输出 token 数组,每个请求一个输出,长度至少为nreq
__INFINI_C __export void
forwardBatchQwen3vl(struct Qwen3vlModel *,
const uint32_t *tokens, uint32_t ntok,
void *pixel_values, uint32_t total_patches,
uint32_t *image_grid_thw, uint32_t num_images,
void *pixel_values_videos, uint32_t total_patches_videos,
uint32_t *video_grid_thw, uint32_t num_videos,
uint32_t patch_features,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct Qwen3vlCache **caches,
void *logits);
#endif // QWEN3VL_WEIGHTS_H
...@@ -8,6 +8,17 @@ from .deepseek_v3 import ( ...@@ -8,6 +8,17 @@ from .deepseek_v3 import (
DeepSeekV3WeightLoaderCStruct, DeepSeekV3WeightLoaderCStruct,
DeepSeekV3CacheCStruct, DeepSeekV3CacheCStruct,
) )
from .qwen3vl import (
Qwen3vlModel,
Qwen3vlMetaCStruct,
TextMetaCStruct,
VisMetaCStruct,
Qwen3vlWeightsCStruct,
Qwen3vlWeightLoaderCStruct,
Qwen3vlVisWeightLoaderCStruct,
Qwen3vlLangWeightLoaderCStruct,
Qwen3vlCacheCStruct,
)
__all__ = [ __all__ = [
"DataType", "DataType",
...@@ -23,5 +34,15 @@ __all__ = [ ...@@ -23,5 +34,15 @@ __all__ = [
"DeepSeekV3MetaCStruct", "DeepSeekV3MetaCStruct",
"DeepSeekV3WeightsCStruct", "DeepSeekV3WeightsCStruct",
"DeepSeekV3WeightLoaderCStruct", "DeepSeekV3WeightLoaderCStruct",
"DeepSeekV3CacheCStruct",
"Qwen3vlModel",
"Qwen3vlMetaCStruct",
"TextMetaCStruct",
"VisMetaCStruct",
"Qwen3vlWeightsCStruct",
"Qwen3vlWeightLoaderCStruct",
"Qwen3vlVisWeightLoaderCStruct",
"Qwen3vlLangWeightLoaderCStruct",
"Qwen3vlCacheCStruct",
"ModelRegister", "ModelRegister",
] ]
from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model
from ctypes import (
c_size_t,
c_uint,
c_uint16,
c_int,
c_float,
c_void_p,
c_bool,
POINTER,
Structure,
CFUNCTYPE,
)
class TextMetaCStruct(Structure):
    """Mirror of the C ``Qwen3vlTextMeta`` struct.

    Field order is ABI. The ``_pad*`` fields reproduce the padding the C
    compiler inserts after a lone 4-byte float before the next 8-byte
    ``size_t`` (LP64 layout) — do not remove or reorder.
    """

    _fields_ = [
        ("bos_token_id", c_size_t),
        ("eos_token_id", c_size_t),
        ("head_dim", c_size_t),
        ("hidden_size", c_size_t),
        ("initializer_range", c_float),
        ("_pad1", c_float),  # compiler padding before intermediate_size
        ("intermediate_size", c_size_t),
        ("max_tokens", c_size_t),
        ("num_attention_heads", c_size_t),
        ("num_hidden_layers", c_size_t),
        ("num_key_value_heads", c_size_t),
        ("rms_norm_eps", c_float),
        ("_pad2", c_float),  # compiler padding before mrope_section
        ("mrope_section", c_size_t * 3),
        ("rope_theta", c_size_t),
        ("vocab_size", c_size_t),
    ]
class VisMetaCStruct(Structure):
    """Mirror of the C ``Qwen3vlVisMeta`` struct (field order is ABI)."""

    _fields_ = [
        ("depth", c_size_t),
        ("deepstack_visual_indexes", c_size_t * 3),
        ("hidden_size", c_size_t),
        ("in_channels", c_size_t),
        ("initializer_range", c_float),
        ("_pad1", c_float),  # compiler padding before intermediate_size
        ("intermediate_size", c_size_t),
        ("num_heads", c_size_t),
        ("num_position_embeddings", c_size_t),
        ("out_hidden_size", c_size_t),
        ("patch_size", c_size_t),
        ("spatial_merge_size", c_size_t),
        ("temporal_patch_size", c_size_t),
    ]
class Qwen3vlMetaCStruct(Structure):
    """Mirror of the C ``Qwen3vlMeta`` struct (field order is ABI)."""

    _fields_ = [
        ("dtype", DataType),
        # assumes DataType is a 4-byte enum; pads it to 8 so text_meta
        # starts at the same offset as in C — TODO confirm
        ("_pad_dtype", c_uint),
        ("text_meta", TextMetaCStruct),
        ("vis_meta", VisMetaCStruct),
        # Token ids
        ("image_token_id", c_size_t),
        ("video_token_id", c_size_t),
        ("vision_end_token_id", c_size_t),
        ("vision_start_token_id", c_size_t),
    ]
class Qwen3vlWeightsCStruct(Structure):
    """Opaque handle to the C-side weight set (``struct Qwen3vlWeights``)."""
    pass


class Qwen3vlModelCStruct(Structure):
    """Opaque handle to the C-side model (``struct Qwen3vlModel``)."""
    pass


class Qwen3vlCacheCStruct(Structure):
    """Opaque handle to the C-side KV cache (``struct Qwen3vlCache``)."""
    pass


# C callback types mirroring qwen3vl_load_global_fn / qwen3vl_load_layer_fn:
# (weights*, cpu_ptr) and (weights*, cpu_ptr, layer_id) respectively.
load_global_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p)
load_layer_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p, c_size_t)
class Qwen3vlLangWeightLoaderCStruct(Structure):
    """Mirror of the C ``Qwen3vlLangWeightLoader`` callback table.

    Field order is ABI — keep identical to the C header.
    """

    _fields_ = [
        # Global
        ("load_input_embd", load_global_fn),
        ("load_output_norm", load_global_fn),
        ("load_output_embd", load_global_fn),
        # Attention
        ("load_attn_norm", load_layer_fn),
        ("load_attn_q_norm", load_layer_fn),
        ("load_attn_k_norm", load_layer_fn),
        ("load_attn_qkv_proj", load_layer_fn),
        ("load_attn_o_proj", load_layer_fn),
        # MLP
        ("load_mlp_norm", load_layer_fn),
        ("load_mlp_gate_up", load_layer_fn),
        ("load_mlp_down", load_layer_fn),
    ]
class Qwen3vlVisWeightLoaderCStruct(Structure):
    """Mirror of the C ``Qwen3vlVisWeightLoader`` callback table.

    Field order is ABI — keep identical to the C header.
    """

    _fields_ = [
        # Patch embed
        ("load_patch_embed_weight", load_global_fn),
        ("load_patch_embed_bias", load_global_fn),
        ("load_pos_embed_weight", load_global_fn),
        # Blocks attention
        ("load_attn_proj_weight", load_layer_fn),
        ("load_attn_proj_bias", load_layer_fn),
        ("load_attn_qkv_weight", load_layer_fn),
        ("load_attn_qkv_bias", load_layer_fn),
        # Blocks MLP
        ("load_mlp_linear_fc1_weight", load_layer_fn),
        ("load_mlp_linear_fc1_bias", load_layer_fn),
        ("load_mlp_linear_fc2_weight", load_layer_fn),
        ("load_mlp_linear_fc2_bias", load_layer_fn),
        # Blocks norm
        ("load_norm1_weight", load_layer_fn),
        ("load_norm1_bias", load_layer_fn),
        ("load_norm2_weight", load_layer_fn),
        ("load_norm2_bias", load_layer_fn),
        # Deepstack merger
        ("load_deepstack_merger_linear_fc1_weight", load_layer_fn),
        ("load_deepstack_merger_linear_fc1_bias", load_layer_fn),
        ("load_deepstack_merger_linear_fc2_weight", load_layer_fn),
        ("load_deepstack_merger_linear_fc2_bias", load_layer_fn),
        ("load_deepstack_merger_norm_weight", load_layer_fn),
        ("load_deepstack_merger_norm_bias", load_layer_fn),
        # Merger
        ("load_merger_linear_fc1_weight", load_global_fn),
        ("load_merger_linear_fc1_bias", load_global_fn),
        ("load_merger_linear_fc2_weight", load_global_fn),
        ("load_merger_linear_fc2_bias", load_global_fn),
        ("load_merger_norm_weight", load_global_fn),
        ("load_merger_norm_bias", load_global_fn),
    ]
class Qwen3vlWeightLoaderCStruct(Structure):
    """Mirror of the C ``Qwen3vlWeightLoader`` (language + vision tables)."""

    _fields_ = [
        ("lang_loader", Qwen3vlLangWeightLoaderCStruct),
        ("vis_loader", Qwen3vlVisWeightLoaderCStruct),
    ]
@register_model
class Qwen3vlModel(BaseModel):
    """ctypes binding for the Qwen3-VL C inference API.

    Declares the exported C functions' argument/return types and exposes
    thin pass-through wrappers. Every signature here must stay in sync
    with the declarations in the qwen3vl C header.
    """

    @classmethod
    def register_lib(cls, lib):
        """Register Qwen3vl model functions with the library"""
        lib.createQwen3vlWeightLoader.argtypes = []
        lib.createQwen3vlWeightLoader.restype = POINTER(Qwen3vlWeightLoaderCStruct)
        lib.createQwen3vlWeights.argtypes = [
            POINTER(Qwen3vlMetaCStruct),
            DeviceType,
            c_int,  # ndev
            POINTER(c_int),  # dev_ids
            c_bool,  # transpose_weight
        ]
        lib.createQwen3vlWeights.restype = POINTER(Qwen3vlWeightsCStruct)
        lib.createQwen3vlModel.argtypes = [
            POINTER(Qwen3vlMetaCStruct),
            POINTER(Qwen3vlWeightsCStruct),
        ]
        lib.createQwen3vlModel.restype = POINTER(Qwen3vlModelCStruct)
        lib.destroyQwen3vlModel.argtypes = [POINTER(Qwen3vlModelCStruct)]
        lib.createQwen3vlCache.argtypes = [POINTER(Qwen3vlModelCStruct)]
        lib.createQwen3vlCache.restype = POINTER(Qwen3vlCacheCStruct)
        lib.dropQwen3vlCache.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(Qwen3vlCacheCStruct),
        ]
        lib.inferBatchQwen3vl.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(c_uint),  # tokens
            c_uint,  # ntok
            c_void_p,  # pixel_values,
            c_uint,  # total_patches,
            POINTER(c_uint),  # image_grid_thw,
            c_uint,  # num_images,
            c_void_p,  # pixel_values_videos,
            c_uint,  # total_patches_videos,
            POINTER(c_uint),  # video_grid_thw,
            c_uint,  # num_videos,
            c_uint,  # patch_features,
            POINTER(c_uint),  # req_lens
            c_uint,  # nreq
            POINTER(c_uint),  # req_pos
            POINTER(POINTER(Qwen3vlCacheCStruct)),  # caches
            POINTER(c_float),  # temperature
            POINTER(c_uint),  # topk
            POINTER(c_float),  # topp
            POINTER(c_uint),  # output
        ]
        lib.forwardBatchQwen3vl.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(c_uint),  # tokens
            c_uint,  # ntok
            c_void_p,  # pixel_values,
            c_uint,  # total_patches,
            POINTER(c_uint),  # image_grid_thw,
            c_uint,  # num_images,
            c_void_p,  # pixel_values_videos,
            c_uint,  # total_patches_videos,
            POINTER(c_uint),  # video_grid_thw,
            c_uint,  # num_videos,
            c_uint,  # patch_features,
            POINTER(c_uint),  # req_lens
            c_uint,  # nreq
            POINTER(c_uint),  # req_pos
            POINTER(POINTER(Qwen3vlCacheCStruct)),  # caches
            c_void_p,  # logits
        ]

    def create_weight_loader(self):
        """Return the C-side table of weight-loading callbacks."""
        return self.lib.createQwen3vlWeightLoader()

    def create_weights(self, meta, device_type, ndev, dev_ids, transpose_weight):
        """Allocate the (empty) device-side weight set."""
        return self.lib.createQwen3vlWeights(
            meta, device_type, ndev, dev_ids, transpose_weight
        )

    def create_model(self, meta, weights):
        """Create a model instance from metadata and loaded weights."""
        return self.lib.createQwen3vlModel(meta, weights)

    def destroy_model(self, model):
        """Destroy the model and release its resources."""
        self.lib.destroyQwen3vlModel(model)

    def create_cache(self, model):
        """Create a per-request KV cache sized for the model."""
        return self.lib.createQwen3vlCache(model)

    def drop_cache(self, model, cache):
        """Release a KV cache created by create_cache."""
        self.lib.dropQwen3vlCache(model, cache)

    def infer_batch(
        self,
        model,
        tokens,
        ntok,
        pixel_values,
        total_patches,
        image_grid_thw,
        num_images,
        pixel_values_videos,
        total_patches_videos,
        video_grid_thw,
        num_videos,
        patch_features,
        req_lens,
        nreq,
        req_pos,
        caches,
        temperature,
        topk,
        topp,
        output,
    ):
        """Run one batched inference step; sampled tokens land in `output`."""
        self.lib.inferBatchQwen3vl(
            model,
            tokens,
            ntok,
            pixel_values,
            total_patches,
            image_grid_thw,
            num_images,
            pixel_values_videos,
            total_patches_videos,
            video_grid_thw,
            num_videos,
            patch_features,
            req_lens,
            nreq,
            req_pos,
            caches,
            temperature,
            topk,
            topp,
            output,
        )

    def forward_batch(
        self,
        model,
        tokens,
        ntok,
        pixel_values,
        total_patches,
        image_grid_thw,
        num_images,
        pixel_values_videos,
        total_patches_videos,
        video_grid_thw,
        num_videos,
        patch_features,
        req_lens,
        nreq,
        req_pos,
        caches,
        logits,
    ):
        """Run one batched forward pass; raw logits are written to `logits`."""
        self.lib.forwardBatchQwen3vl(
            model,
            tokens,
            ntok,
            pixel_values,
            total_patches,
            image_grid_thw,
            num_images,
            pixel_values_videos,
            total_patches_videos,
            video_grid_thw,
            num_videos,
            patch_features,
            req_lens,
            nreq,
            req_pos,
            caches,
            logits,
        )
This diff is collapsed.
...@@ -16,7 +16,7 @@ public: ...@@ -16,7 +16,7 @@ public:
class MemoryPool : public AllocatorBase { class MemoryPool : public AllocatorBase {
public: public:
static constexpr size_t DEFAULT_ALIGNMENT = 256; static constexpr size_t DEFAULT_ALIGNMENT = 512;
explicit MemoryPool(size_t initialSize = 0, size_t alignment = DEFAULT_ALIGNMENT); explicit MemoryPool(size_t initialSize = 0, size_t alignment = DEFAULT_ALIGNMENT);
~MemoryPool(); ~MemoryPool();
......
...@@ -153,6 +153,8 @@ public: ...@@ -153,6 +153,8 @@ public:
class CacheManager { class CacheManager {
public: public:
DECLARE_OP_CACHE(Add) DECLARE_OP_CACHE(Add)
DECLARE_OP_CACHE(Conv)
DECLARE_OP_CACHE(Mul)
DECLARE_OP_CACHE(RMSNorm) DECLARE_OP_CACHE(RMSNorm)
DECLARE_OP_CACHE(Gemm) DECLARE_OP_CACHE(Gemm)
DECLARE_OP_CACHE(RoPE) DECLARE_OP_CACHE(RoPE)
...@@ -160,11 +162,14 @@ public: ...@@ -160,11 +162,14 @@ public:
DECLARE_OP_CACHE(CausalSoftmax) DECLARE_OP_CACHE(CausalSoftmax)
DECLARE_OP_CACHE(Topkrouter) DECLARE_OP_CACHE(Topkrouter)
DECLARE_OP_CACHE(SwiGLU) DECLARE_OP_CACHE(SwiGLU)
DECLARE_OP_CACHE(Silu)
DECLARE_OP_CACHE(RandomSample) DECLARE_OP_CACHE(RandomSample)
DECLARE_OP_CACHE(DequantizeAWQ) DECLARE_OP_CACHE(DequantizeAWQ)
CacheManager(size_t capacity = 100) CacheManager(size_t capacity = 100)
: Add_cache(capacity, DESTROY_FUNC(Add)), : Add_cache(capacity, DESTROY_FUNC(Add)),
Conv_cache(capacity, DESTROY_FUNC(Conv)),
Mul_cache(capacity, DESTROY_FUNC(Mul)),
RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)), RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)),
Gemm_cache(capacity, DESTROY_FUNC(Gemm)), Gemm_cache(capacity, DESTROY_FUNC(Gemm)),
RoPE_cache(capacity, DESTROY_FUNC(RoPE)), RoPE_cache(capacity, DESTROY_FUNC(RoPE)),
...@@ -172,6 +177,7 @@ public: ...@@ -172,6 +177,7 @@ public:
CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)), CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)),
Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)), Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)),
SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)), SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)),
Silu_cache(capacity, DESTROY_FUNC(Silu)),
RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)), RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)),
DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {} DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {}
......
...@@ -33,6 +33,61 @@ void InferenceContext::add(std::shared_ptr<Tensor> c, ...@@ -33,6 +33,61 @@ void InferenceContext::add(std::shared_ptr<Tensor> c,
c->data(), a->data(), b->data(), stream)); c->data(), a->data(), b->data(), stream));
} }
// n-dimensional convolution: y = conv(x, w) (+ bias when non-null).
// pads / strides / dilations each point to `n` per-dimension parameters,
// passed through to infiniop untouched.
void InferenceContext::conv(std::shared_ptr<Tensor> y,
                            std::shared_ptr<Tensor> x,
                            std::shared_ptr<Tensor> w,
                            std::shared_ptr<Tensor> bias,
                            void *pads,
                            void *strides,
                            void *dilations,
                            size_t n) {
    size_t key = CacheManager::createDescriptorKey(y, x, w, bias);
    // Combine additional parameters into the key for unique identification
    // NOTE(review): this hashes the POINTER VALUES of pads/strides/dilations,
    // not their contents. Two calls whose parameter arrays happen to live at
    // the same address (e.g. reused stack buffers) but hold different values
    // would collide and reuse a stale descriptor. Confirm callers always pass
    // value-stable arrays, or hash the `n` elements instead.
    hash_combine(key, std::hash<void *>()(pads));
    hash_combine(key, std::hash<void *>()(strides));
    hash_combine(key, std::hash<void *>()(dilations));
    hash_combine(key, std::hash<size_t>()(n));
    infiniopConvDescriptor_t desc;
    if (!cache_manager->getConvDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateConvDescriptor(
            op_handle, &desc, y->desc(), x->desc(), w->desc(),
            bias ? bias->desc() : nullptr, pads, strides, dilations, n));
        cache_manager->putConvDescriptor(key, desc);
    }
    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetConvWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();
    RUN_INFINI(infiniopConv(
        desc, workspace, workspace_size,
        y->data(), x->data(), w->data(),
        bias ? bias->data() : nullptr, stream));
}
// Element-wise product c = a * b via the infiniop Mul operator.
// Descriptors are memoized per tensor-shape key in the CacheManager.
void InferenceContext::mul(std::shared_ptr<Tensor> c,
                           std::shared_ptr<Tensor> a,
                           std::shared_ptr<Tensor> b) {
    const size_t cache_key = CacheManager::createDescriptorKey(c, a, b);
    infiniopMulDescriptor_t mul_desc;
    const bool hit = cache_manager->getMulDescriptor(cache_key, mul_desc);
    if (!hit) {
        // First time for this shape combination: build and memoize.
        RUN_INFINI(infiniopCreateMulDescriptor(op_handle, &mul_desc,
                                               c->desc(), a->desc(), b->desc()));
        cache_manager->putMulDescriptor(cache_key, mul_desc);
    }
    size_t ws_bytes = 0;
    RUN_INFINI(infiniopGetMulWorkspaceSize(mul_desc, &ws_bytes));
    ensure_workspace(ws_bytes);
    RUN_INFINI(infiniopMul(
        mul_desc, workspace_storage->memory(), ws_bytes,
        c->data(), a->data(), b->data(), stream));
}
void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y, void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> w,
...@@ -189,6 +244,26 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out, ...@@ -189,6 +244,26 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out,
out->data(), up->data(), gate->data(), stream)); out->data(), up->data(), gate->data(), stream));
} }
// SiLU activation, out = silu(input), via the infiniop Silu operator.
// Descriptors are memoized per tensor-shape key in the CacheManager.
void InferenceContext::silu(std::shared_ptr<Tensor> out,
                            std::shared_ptr<Tensor> input) {
    const size_t cache_key = CacheManager::createDescriptorKey(out, input);
    infiniopSiluDescriptor_t silu_desc;
    if (!cache_manager->getSiluDescriptor(cache_key, silu_desc)) {
        // First time for this shape combination: build and memoize.
        RUN_INFINI(infiniopCreateSiluDescriptor(
            op_handle, &silu_desc, out->desc(), input->desc()));
        cache_manager->putSiluDescriptor(cache_key, silu_desc);
    }
    size_t ws_bytes = 0;
    RUN_INFINI(infiniopGetSiluWorkspaceSize(silu_desc, &ws_bytes));
    ensure_workspace(ws_bytes);
    RUN_INFINI(infiniopSilu(silu_desc, workspace_storage->memory(), ws_bytes,
                            out->data(), input->data(), stream));
}
void InferenceContext::randomSample(std::shared_ptr<Tensor> out, void InferenceContext::randomSample(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> prob, std::shared_ptr<Tensor> prob,
float random_val, float top_p, uint32_t top_k, float temperature) { float random_val, float top_p, uint32_t top_k, float temperature) {
......
...@@ -19,6 +19,14 @@ struct InferenceContext { ...@@ -19,6 +19,14 @@ struct InferenceContext {
void add(std::shared_ptr<Tensor> c, void add(std::shared_ptr<Tensor> c,
std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> a,
std::shared_ptr<Tensor> b); std::shared_ptr<Tensor> b);
// n-dim convolution y = conv(x, w) (+ bias); pads/strides/dilations point
// to n per-dimension parameters (see InferenceContext::conv).
void conv(std::shared_ptr<Tensor> y,
          std::shared_ptr<Tensor> x,
          std::shared_ptr<Tensor> w,
          std::shared_ptr<Tensor> bias,
          void *pads, void *strides, void *dilations, size_t n);
// Element-wise product c = a * b (infiniop Mul).
void mul(std::shared_ptr<Tensor> c,
         std::shared_ptr<Tensor> a,
         std::shared_ptr<Tensor> b);
void rmsnorm(std::shared_ptr<Tensor> y, void rmsnorm(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> w,
...@@ -48,6 +56,8 @@ struct InferenceContext { ...@@ -48,6 +56,8 @@ struct InferenceContext {
void swiglu(std::shared_ptr<Tensor> out, void swiglu(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> up,
std::shared_ptr<Tensor> gate); std::shared_ptr<Tensor> gate);
// SiLU activation out = silu(input) (infiniop Silu).
void silu(std::shared_ptr<Tensor> out,
          std::shared_ptr<Tensor> input);
void randomSample(std::shared_ptr<Tensor> out, void randomSample(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> prob, std::shared_ptr<Tensor> prob,
float random_val, float top_p, uint32_t top_k, float temperature); float random_val, float top_p, uint32_t top_k, float temperature);
...@@ -81,6 +91,15 @@ inline void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::share ...@@ -81,6 +91,15 @@ inline void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::share
getInferenceContext().add(c, a, b); getInferenceContext().add(c, a, b);
} }
// Free-function conveniences forwarding to the context returned by
// getInferenceContext().
inline void conv(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> bias,
                 void *pads, void *strides, void *dilations, size_t n) {
    getInferenceContext().conv(y, x, w, bias, pads, strides, dilations, n);
}

inline void mul(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b) {
    getInferenceContext().mul(c, a, b);
}
inline void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, inline void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w, float epsilon) { std::shared_ptr<Tensor> w, float epsilon) {
getInferenceContext().rmsnorm(y, x, w, epsilon); getInferenceContext().rmsnorm(y, x, w, epsilon);
...@@ -131,6 +150,10 @@ inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, ...@@ -131,6 +150,10 @@ inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
getInferenceContext().swiglu(out, up, gate); getInferenceContext().swiglu(out, up, gate);
} }
// Free-function convenience forwarding to getInferenceContext().
inline void silu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> input) {
    getInferenceContext().silu(out, input);
}
inline void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob, inline void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob,
float random_val, float top_p, uint32_t top_k, float temperature) { float random_val, float top_p, uint32_t top_k, float temperature) {
getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature); getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature);
......
This diff is collapsed.
#include "qwen3vl_impl.hpp"
__INFINI_C struct Qwen3vlCache *
createQwen3vlCache(const struct Qwen3vlModel *model) {
Qwen3vlCache *cache = new Qwen3vlCache();
auto ndev = model->dev_resources.size();
auto nlayer = model->meta.text_meta.num_hidden_layers;
auto max_len = model->meta.text_meta.max_tokens;
auto dh = model->meta.text_meta.head_dim;
auto nkv = model->meta.text_meta.num_key_value_heads / size_t(ndev);
auto k_rot_shape = std::vector<size_t>{max_len, nkv, dh};
auto v_shape = std::vector<size_t>{max_len, nkv, dh};
for (size_t idev = 0; idev < ndev; idev++) {
RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
auto k_rot_cache = std::vector<std::shared_ptr<Tensor>>();
auto v_cache = std::vector<std::shared_ptr<Tensor>>();
for (size_t layer = 0; layer < nlayer; layer++) {
k_rot_cache.push_back(std::move(Tensor::buffer(model->meta.dtype, k_rot_shape)));
v_cache.push_back(std::move(Tensor::buffer(model->meta.dtype, v_shape)));
}
cache->k_rot.push_back(k_rot_cache);
cache->v.push_back(v_cache);
}
return cache;
}
////// TODO(review): does the visual deepstack path also need its own cache?
/// @brief Release a KV cache created by createQwen3vlCache.
/// Tensors are reset explicitly with their owning device made current —
/// presumably because device memory must be freed on the device that
/// allocated it (rather than relying on ~Qwen3vlCache, whose destruction
/// order would not set the device) — TODO confirm against the allocator.
__INFINI_C void
dropQwen3vlCache(const struct Qwen3vlModel *model,
                 struct Qwen3vlCache *cache) {
    auto ndev = model->dev_resources.size();
    auto nlayer = model->meta.text_meta.num_hidden_layers;
    for (size_t idev = 0; idev < ndev; idev++) {
        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
        for (size_t layer = 0; layer < nlayer; layer++) {
            cache->k_rot[idev][layer].reset();
            cache->v[idev][layer].reset();
        }
    }
    delete cache;
}
#ifndef QWEN3VL_IMPL_H
#define QWEN3VL_IMPL_H
#include "infinicore_infer.h"
#include "../../allocator.hpp"
#include "../../tensor.hpp"
#include <condition_variable>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
// Device tensors for one language-model transformer layer.
struct Qwen3vlLayerWeight {
    std::shared_ptr<Tensor> attn_norm;
    std::shared_ptr<Tensor> attn_qkv_proj; // fused Q/K/V projection
    std::shared_ptr<Tensor> attn_q_norm;
    std::shared_ptr<Tensor> attn_k_norm;
    std::shared_ptr<Tensor> attn_o_proj;
    std::shared_ptr<Tensor> mlp_norm;
    std::shared_ptr<Tensor> mlp_gate_up; // fused gate + up projection
    std::shared_ptr<Tensor> mlp_down;
};
// Full language-model weight set: embeddings, final norm, and all layers.
struct Qwen3vlLanguageModelWeight {
    std::shared_ptr<Tensor> in_embd, out_embd, out_norm;
    std::vector<Qwen3vlLayerWeight> layers; // one entry per hidden layer
};
// Device tensors for one vision-encoder transformer block.
struct Qwen3vlVisBlockWeight {
    std::shared_ptr<Tensor> attn_proj_weight, attn_proj_bias, attn_qkv_weight, attn_qkv_bias;
    std::shared_ptr<Tensor> mlp_linear_fc1_weight, mlp_linear_fc1_bias, mlp_linear_fc2_weight, mlp_linear_fc2_bias;
    std::shared_ptr<Tensor> norm1_weight, norm1_bias, norm2_weight, norm2_bias;
};
// Two-layer MLP + norm used by the deepstack mergers (one per deepstack
// visual index).
struct DeepstackMergerWeight {
    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
    std::shared_ptr<Tensor> norm_weight, norm_bias;
};

// Final patch merger. Layout is identical to DeepstackMergerWeight but kept
// as a distinct type; collapsing them into one type would break any forward
// declarations elsewhere.
struct MergerWeight {
    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
    std::shared_ptr<Tensor> norm_weight, norm_bias;
};
// Full vision-encoder weight set: patch/pos embeddings, all blocks, the
// deepstack mergers and the final merger.
struct Qwen3vlVisualEncoderWeight {
    std::shared_ptr<Tensor> patch_embed_weight, patch_embed_bias, pos_embed_weight;
    std::vector<Qwen3vlVisBlockWeight> blocks;
    std::vector<DeepstackMergerWeight> deepstack_mergers;
    std::shared_ptr<MergerWeight> merger;
};
// All weights resident on one device, plus RoPE tables and the stream used
// while uploading them.
struct Qwen3vlDeviceWeights {
    std::shared_ptr<Tensor> sin_table, cos_table; // RoPE lookup tables
    std::shared_ptr<Qwen3vlLanguageModelWeight> w_lang;
    std::shared_ptr<Qwen3vlVisualEncoderWeight> w_vis;
    infiniDevice_t device;
    int dev_id;
    infinirtStream_t load_stream; // stream used for weight uploads
};
// Whole-model weight set: one Qwen3vlDeviceWeights per device.
struct Qwen3vlWeights {
    // NOTE(review): non-owning pointer — the Qwen3vlMeta must outlive this
    // object; confirm the caller guarantees that.
    Qwen3vlMeta const *meta;
    bool transpose_weight;
    std::vector<std::shared_ptr<Qwen3vlDeviceWeights>> device_weights;
    // Defined in the implementation file.
    Qwen3vlWeights(const Qwen3vlMeta *meta,
                   infiniDevice_t device,
                   int ndev,
                   const int *dev_ids,
                   bool transpose_weight);
};
// Everything one worker needs to run its shard of the model on one device.
struct Qwen3vlDeviceResource {
    // Device
    infiniDevice_t device;
    int device_id;
    infiniopHandle_t handle;
    // Weights
    std::shared_ptr<Qwen3vlDeviceWeights> weights;
    // Streams
    infinirtStream_t stream;
    // Communicator (multi-device collectives)
    infinicclComm_t comm;
    std::shared_ptr<MemoryPool> memory_pool;
};
// Per-device worker-thread synchronization state.
// NOTE(review): mtx_sync / sync_cnt / cv_sync are `inline static`, i.e.
// shared process-wide across ALL Qwen3vlModel instances — confirm two
// models are never run concurrently, or make these per-model.
struct InferState { // qwen3vl namespace
    inline static std::mutex mtx_sync;            // presumably guards sync_cnt
    inline static int sync_cnt;                   // zero-initialized (static storage duration)
    inline static std::condition_variable cv_sync;
    std::mutex mtx;                               // guards the flags below
    std::condition_variable cv_load, cv_start, cv_done;
    bool loaded = false;    // set once this device's weights are ready
    bool proceed = false;   // a request is staged; worker may run
    bool exit_flag = false; // ask the worker thread to terminate
};
// One staged batch request, shared with the worker threads. Mirrors the
// parameter lists of inferBatchQwen3vl / forwardBatchQwen3vl; exactly one
// of `output` (sampled tokens) or `logits` is used per call.
struct InferRequest { // qwen3vl namespace
    const uint32_t *tokens;
    uint32_t ntok;
    void *pixel_values;
    uint32_t total_patches;
    uint32_t *image_grid_thw;
    uint32_t num_images;
    void *pixel_values_videos;
    uint32_t total_patches_videos;
    uint32_t *video_grid_thw;
    uint32_t num_videos;
    uint32_t patch_features;
    const uint32_t *req_lens;
    uint32_t nreq;
    const uint32_t *req_pos;
    struct Qwen3vlCache **kv_caches;
    const float *temperature;
    const uint32_t *topk;
    const float *topp;
    uint32_t *output; // sampled tokens (inferBatch path)
    void *logits;     // raw logits (forwardBatch path)
};
// Model instance: one worker thread + resource set per device, plus the
// single staged request they all consume.
struct Qwen3vlModel {
    Qwen3vlMeta meta;
    infiniDevice_t device;
    std::vector<int> dev_ids;
    std::vector<Qwen3vlDeviceResource> dev_resources; // one per device
    std::vector<InferState> states;                   // one per device
    std::vector<std::thread> threads;                 // one worker per device
    InferRequest req; // shared staging area for the current batch
    Qwen3vlModel(const Qwen3vlMeta *, const Qwen3vlWeights *weights);
};
// Per-request KV cache, indexed [device][layer]; each tensor has shape
// {max_tokens, num_key_value_heads / ndev, head_dim} (see createQwen3vlCache).
struct Qwen3vlCache {
    std::vector<std::vector<std::shared_ptr<Tensor>>> k_rot, v;
};
#endif
This diff is collapsed.
...@@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape, ...@@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape,
std::cout << std::endl; std::cout << std::endl;
} else if (dim < shape.size() - 1) { } else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) { for (size_t i = 0; i < shape[dim]; i++) {
print_data(data + i * strides[dim], shape, strides, dim + 1); print_data_bf16(data + i * strides[dim], shape, strides, dim + 1);
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment