"include/gtest/vscode:/vscode.git/clone" did not exist on "814a5e9310bbc8aeb0b985c1dcb66496835bf73a"
Unverified commit cfe4b1a8 authored by thatPepe, committed by GitHub
Browse files

Merge pull request #267 from InfiniTensor/issue/263_T2-1-4

【比赛2025秋】T2-1-4 qwen3vl
parents 66bfd282 b1f6af34
...@@ -4,7 +4,11 @@ ...@@ -4,7 +4,11 @@
#include "infinicore_infer/cache.h" #include "infinicore_infer/cache.h"
#include "infinicore_infer/weights_loader.h" #include "infinicore_infer/weights_loader.h"
#include "infinicore_infer/models/deepseek.h" #include "infinicore_infer/models/deepseek.h"
#include "infinicore_infer/models/jiuge.h" #include "infinicore_infer/models/jiuge.h"
#include "infinicore_infer/models/jiuge_awq.h"
#include "infinicore_infer/models/qwen3vl.h"
#endif /* INFINICORE_INFER_H */ #endif /* INFINICORE_INFER_H */
#ifndef QWEN3VL_WEIGHTS_H
#define QWEN3VL_WEIGHTS_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdbool.h> /* fix: `bool` is used by createQwen3vlWeights but was never included */
#include <stddef.h>
#include <stdint.h>

/* Opaque weight store, defined by the implementation.
 * fix: the function-pointer typedefs below use the bare name `Qwen3vlWeights`
 * (no `struct` keyword), which only compiles as C++; this typedef makes the
 * header valid C as well, and remains compatible with existing C++ users. */
typedef struct Qwen3vlWeights Qwen3vlWeights;

// Function pointer signatures for weight-loading callbacks.
// Global tensors: copy one whole-model tensor from host memory `cpu_ptr`.
typedef void (*qwen3vl_load_global_fn)(Qwen3vlWeights *, void *cpu_ptr);
// Per-layer tensors: copy the tensor of layer `layer_id` from host memory `cpu_ptr`.
typedef void (*qwen3vl_load_layer_fn)(Qwen3vlWeights *, void *cpu_ptr, size_t layer_id);
// Struct containing all weight loading functions for the language (text) model.
// *_global_fn entries load whole-model tensors; *_layer_fn entries load the
// tensor of one decoder layer, selected by layer_id.
typedef struct {
    // Global
    qwen3vl_load_global_fn load_input_embd;   // token embedding table
    qwen3vl_load_global_fn load_output_norm;  // final norm weight
    qwen3vl_load_global_fn load_output_embd;  // output embedding (tied to the input table on the Python side)
    // Attention
    qwen3vl_load_layer_fn load_attn_norm;
    qwen3vl_load_layer_fn load_attn_q_norm;
    qwen3vl_load_layer_fn load_attn_k_norm;
    qwen3vl_load_layer_fn load_attn_qkv_proj; // fused Q/K/V projection (stacked per device by the Python loader)
    qwen3vl_load_layer_fn load_attn_o_proj;
    // MLP
    qwen3vl_load_layer_fn load_mlp_norm;
    qwen3vl_load_layer_fn load_mlp_gate_up;   // fused gate+up projection (stacked per device by the Python loader)
    qwen3vl_load_layer_fn load_mlp_down;
} Qwen3vlLangWeightLoader;
// Weight-loading callbacks for the vision tower.
typedef struct {
    // Patch_embed
    qwen3vl_load_global_fn load_patch_embed_weight;
    qwen3vl_load_global_fn load_patch_embed_bias;
    qwen3vl_load_global_fn load_pos_embed_weight;
    // blocks attn (per transformer block, indexed by layer_id)
    qwen3vl_load_layer_fn load_attn_proj_weight;
    qwen3vl_load_layer_fn load_attn_proj_bias;
    qwen3vl_load_layer_fn load_attn_qkv_weight;
    qwen3vl_load_layer_fn load_attn_qkv_bias;
    // block mlp
    qwen3vl_load_layer_fn load_mlp_linear_fc1_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc1_bias;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_bias;
    // block norm
    qwen3vl_load_layer_fn load_norm1_weight;
    qwen3vl_load_layer_fn load_norm1_bias;
    qwen3vl_load_layer_fn load_norm2_weight;
    qwen3vl_load_layer_fn load_norm2_bias;
    // deepstack_merger (layer_id is the position within deepstack_visual_indexes)
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_bias;
    // merger (final projector, whole-model tensors)
    qwen3vl_load_global_fn load_merger_linear_fc1_weight;
    qwen3vl_load_global_fn load_merger_linear_fc1_bias;
    qwen3vl_load_global_fn load_merger_linear_fc2_weight;
    qwen3vl_load_global_fn load_merger_linear_fc2_bias;
    qwen3vl_load_global_fn load_merger_norm_weight;
    qwen3vl_load_global_fn load_merger_norm_bias;
} Qwen3vlVisWeightLoader;
// Aggregates the language-model and vision-tower loader tables.
typedef struct {
    Qwen3vlLangWeightLoader lang_loader;
    Qwen3vlVisWeightLoader vis_loader;
} Qwen3vlWeightLoader;
// Opaque model handle, defined by the implementation.
struct Qwen3vlModel;

// Hyper-parameters of the text (language) decoder.
// NOTE(review): layout must stay in sync with TextMetaCStruct on the Python
// side, which inserts explicit 4-byte pads after the two float members to
// mirror the compiler's alignment padding before the following size_t fields.
typedef struct {
    size_t bos_token_id;
    size_t eos_token_id;
    size_t head_dim;
    size_t hidden_size;
    float initializer_range;
    size_t intermediate_size;
    size_t max_tokens;          // maximum sequence length (Python fills max_position_embeddings by default)
    size_t num_attention_heads;
    size_t num_hidden_layers;
    size_t num_key_value_heads;
    float rms_norm_eps;
    size_t mrope_section[3];    // multimodal RoPE section sizes — presumably (t, h, w); confirm against config
    size_t rope_theta;          // NOTE(review): integral here, though configs may store a float — confirm
    size_t vocab_size;
} Qwen3vlTextMeta;
// Hyper-parameters of the vision tower.
// NOTE(review): layout must stay in sync with VisMetaCStruct on the Python
// side (explicit pad after the float member).
typedef struct {
    size_t depth;                        // number of vision transformer blocks
    size_t deepstack_visual_indexes[3];  // which blocks feed the deepstack mergers
    size_t hidden_size;
    size_t in_channels;
    float initializer_range;
    size_t intermediate_size;
    size_t num_heads;
    size_t num_position_embeddings;
    size_t out_hidden_size;
    size_t patch_size;
    size_t spatial_merge_size;
    size_t temporal_patch_size;
} Qwen3vlVisMeta;
// Top-level model meta: weight dtype, sub-model metas, multimodal token ids.
typedef struct {
    infiniDtype_t dtype; // element dtype of the weights, e.g. INFINI_DTYPE_BF16
    Qwen3vlTextMeta text_meta;
    Qwen3vlVisMeta vis_meta;
    // Special token ids marking image/video content in the token stream.
    size_t image_token_id;
    size_t video_token_id;
    size_t vision_end_token_id;
    size_t vision_start_token_id;
} Qwen3vlMeta;
//////////////////// APIs ///////////////////////
/// @brief Create a model instance from its meta information and preloaded weights.
__INFINI_C __export struct Qwen3vlModel *
createQwen3vlModel(const Qwen3vlMeta *,
                   const Qwen3vlWeights *);

/// @brief Allocate the weight store for a model.
/// @param meta model meta information
/// @param device coprocessor (device) type
/// @param ndev number of coprocessors
/// @param dev_ids coprocessor ids, array of length ndev
/// @param transpose_weight whether matrices are stored transposed — NOTE(review): confirm exact convention
/// NOTE(review): unlike the other APIs this one is not __export-ed — confirm intentional.
__INFINI_C Qwen3vlWeights *
createQwen3vlWeights(const Qwen3vlMeta *meta,
                     infiniDevice_t device,
                     int ndev,
                     const int *dev_ids,
                     bool transpose_weight);

/// @brief Create the table of weight-loading callbacks.
/// NOTE(review): prefer an explicit (void) parameter list in C.
__INFINI_C __export Qwen3vlWeightLoader *
createQwen3vlWeightLoader();

/// @brief Destroy a model.
__INFINI_C __export void destroyQwen3vlModel(struct Qwen3vlModel *);

/// @brief Create a KV cache usable with the given model.
__INFINI_C __export struct Qwen3vlCache *
createQwen3vlCache(const struct Qwen3vlModel *);

/// @brief Release a KV cache.
__INFINI_C __export void
dropQwen3vlCache(const struct Qwen3vlModel *,
                 struct Qwen3vlCache *);

/// @brief Run one batched inference step and sample one new token per request.
/// @param tokens input token buffer
/// @param ntok total number of input tokens
/// @param pixel_values image patch data (total_patches patches); image_grid_thw per-image (t,h,w) grids for num_images images
/// @param pixel_values_videos video patch data (total_patches_videos patches); video_grid_thw per-video grids for num_videos videos
/// @param patch_features feature dimension of a single patch
/// @param req_lens number of tokens of each request
/// @param nreq number of requests
/// @param req_pos starting position of each request
/// @param caches KV cache of each request
/// @param temperature sampling temperature (0. means greedy sampling)
/// @param topk sampling top-k (1 means greedy sampling)
/// @param topp sampling top-p
/// @param output output token array, one token per request, length at least nreq
__INFINI_C __export void
inferBatchQwen3vl(struct Qwen3vlModel *,
                  const uint32_t *tokens, uint32_t ntok,
                  void *pixel_values, uint32_t total_patches,
                  uint32_t *image_grid_thw, uint32_t num_images,
                  void *pixel_values_videos, uint32_t total_patches_videos,
                  uint32_t *video_grid_thw, uint32_t num_videos,
                  uint32_t patch_features,
                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                  struct Qwen3vlCache **caches,
                  const float *temperature, const uint32_t *topk, const float *topp,
                  uint32_t *output);

/// @brief Run one batched forward pass and write the logits produced after the
///        output embedding (no sampling). Vision/request parameters are the
///        same as for inferBatchQwen3vl.
/// @param logits output buffer, one logits vector per request
__INFINI_C __export void
forwardBatchQwen3vl(struct Qwen3vlModel *,
                    const uint32_t *tokens, uint32_t ntok,
                    void *pixel_values, uint32_t total_patches,
                    uint32_t *image_grid_thw, uint32_t num_images,
                    void *pixel_values_videos, uint32_t total_patches_videos,
                    uint32_t *video_grid_thw, uint32_t num_videos,
                    uint32_t patch_features,
                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                    struct Qwen3vlCache **caches,
                    void *logits);

#endif // QWEN3VL_WEIGHTS_H
...@@ -8,6 +8,17 @@ from .deepseek_v3 import ( ...@@ -8,6 +8,17 @@ from .deepseek_v3 import (
DeepSeekV3WeightLoaderCStruct, DeepSeekV3WeightLoaderCStruct,
DeepSeekV3CacheCStruct, DeepSeekV3CacheCStruct,
) )
from .qwen3vl import (
Qwen3vlModel,
Qwen3vlMetaCStruct,
TextMetaCStruct,
VisMetaCStruct,
Qwen3vlWeightsCStruct,
Qwen3vlWeightLoaderCStruct,
Qwen3vlVisWeightLoaderCStruct,
Qwen3vlLangWeightLoaderCStruct,
Qwen3vlCacheCStruct,
)
__all__ = [ __all__ = [
"DataType", "DataType",
...@@ -23,5 +34,15 @@ __all__ = [ ...@@ -23,5 +34,15 @@ __all__ = [
"DeepSeekV3MetaCStruct", "DeepSeekV3MetaCStruct",
"DeepSeekV3WeightsCStruct", "DeepSeekV3WeightsCStruct",
"DeepSeekV3WeightLoaderCStruct", "DeepSeekV3WeightLoaderCStruct",
"DeepSeekV3CacheCStruct",
"Qwen3vlModel",
"Qwen3vlMetaCStruct",
"TextMetaCStruct",
"VisMetaCStruct",
"Qwen3vlWeightsCStruct",
"Qwen3vlWeightLoaderCStruct",
"Qwen3vlVisWeightLoaderCStruct",
"Qwen3vlLangWeightLoaderCStruct",
"Qwen3vlCacheCStruct",
"ModelRegister", "ModelRegister",
] ]
from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model
from ctypes import (
c_size_t,
c_uint,
c_uint16,
c_int,
c_float,
c_void_p,
c_bool,
POINTER,
Structure,
CFUNCTYPE,
)
class TextMetaCStruct(Structure):
    """ctypes mirror of the C `Qwen3vlTextMeta` struct.

    The `_pad*` fields reproduce the 4-byte alignment padding a C compiler
    inserts after a float member when the next member is a size_t
    (assumes a 64-bit ABI with 8-byte size_t — TODO confirm for other targets).
    """

    _fields_ = [
        ("bos_token_id", c_size_t),
        ("eos_token_id", c_size_t),
        ("head_dim", c_size_t),
        ("hidden_size", c_size_t),
        ("initializer_range", c_float),
        ("_pad1", c_float),  # alignment padding before intermediate_size
        ("intermediate_size", c_size_t),
        ("max_tokens", c_size_t),
        ("num_attention_heads", c_size_t),
        ("num_hidden_layers", c_size_t),
        ("num_key_value_heads", c_size_t),
        ("rms_norm_eps", c_float),
        ("_pad2", c_float),  # alignment padding before mrope_section
        ("mrope_section", c_size_t * 3),
        ("rope_theta", c_size_t),
        ("vocab_size", c_size_t),
    ]
class VisMetaCStruct(Structure):
    """ctypes mirror of the C `Qwen3vlVisMeta` struct.

    `_pad1` reproduces the alignment padding after the float member
    (assumes a 64-bit ABI — TODO confirm for other targets).
    """

    _fields_ = [
        ("depth", c_size_t),
        ("deepstack_visual_indexes", c_size_t * 3),
        ("hidden_size", c_size_t),
        ("in_channels", c_size_t),
        ("initializer_range", c_float),
        ("_pad1", c_float),  # alignment padding before intermediate_size
        ("intermediate_size", c_size_t),
        ("num_heads", c_size_t),
        ("num_position_embeddings", c_size_t),
        ("out_hidden_size", c_size_t),
        ("patch_size", c_size_t),
        ("spatial_merge_size", c_size_t),
        ("temporal_patch_size", c_size_t),
    ]
class Qwen3vlMetaCStruct(Structure):
    """ctypes mirror of the C `Qwen3vlMeta` struct (dtype, sub-metas, token ids)."""

    _fields_ = [
        ("dtype", DataType),
        ("_pad_dtype", c_uint),  # alignment padding before the size_t-aligned sub-struct
        ("text_meta", TextMetaCStruct),
        ("vis_meta", VisMetaCStruct),
        # Token ids
        ("image_token_id", c_size_t),
        ("video_token_id", c_size_t),
        ("vision_end_token_id", c_size_t),
        ("vision_start_token_id", c_size_t),
    ]
class Qwen3vlWeightsCStruct(Structure):
    """Opaque handle to the C-side weight store."""

    pass


class Qwen3vlModelCStruct(Structure):
    """Opaque handle to the C-side model."""

    pass


class Qwen3vlCacheCStruct(Structure):
    """Opaque handle to the C-side KV cache."""

    pass


# C callback types mirroring qwen3vl_load_global_fn / qwen3vl_load_layer_fn:
# global: (weights, cpu_ptr) -> None; layer: (weights, cpu_ptr, layer_id) -> None
load_global_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p)
load_layer_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p, c_size_t)
class Qwen3vlLangWeightLoaderCStruct(Structure):
    """ctypes mirror of the C `Qwen3vlLangWeightLoader` callback table.

    Field order must match the C struct exactly.
    """

    _fields_ = [
        # Global
        ("load_input_embd", load_global_fn),
        ("load_output_norm", load_global_fn),
        ("load_output_embd", load_global_fn),
        # Attention
        ("load_attn_norm", load_layer_fn),
        ("load_attn_q_norm", load_layer_fn),
        ("load_attn_k_norm", load_layer_fn),
        ("load_attn_qkv_proj", load_layer_fn),
        ("load_attn_o_proj", load_layer_fn),
        # MLP
        ("load_mlp_norm", load_layer_fn),
        ("load_mlp_gate_up", load_layer_fn),
        ("load_mlp_down", load_layer_fn),
    ]
class Qwen3vlVisWeightLoaderCStruct(Structure):
    """ctypes mirror of the C `Qwen3vlVisWeightLoader` callback table.

    Field order must match the C struct exactly.
    """

    _fields_ = [
        # Patch embed
        ("load_patch_embed_weight", load_global_fn),
        ("load_patch_embed_bias", load_global_fn),
        ("load_pos_embed_weight", load_global_fn),
        # Blocks attention
        ("load_attn_proj_weight", load_layer_fn),
        ("load_attn_proj_bias", load_layer_fn),
        ("load_attn_qkv_weight", load_layer_fn),
        ("load_attn_qkv_bias", load_layer_fn),
        # Blocks MLP
        ("load_mlp_linear_fc1_weight", load_layer_fn),
        ("load_mlp_linear_fc1_bias", load_layer_fn),
        ("load_mlp_linear_fc2_weight", load_layer_fn),
        ("load_mlp_linear_fc2_bias", load_layer_fn),
        # Blocks norm
        ("load_norm1_weight", load_layer_fn),
        ("load_norm1_bias", load_layer_fn),
        ("load_norm2_weight", load_layer_fn),
        ("load_norm2_bias", load_layer_fn),
        # Deepstack merger (indexed by position in deepstack_visual_indexes)
        ("load_deepstack_merger_linear_fc1_weight", load_layer_fn),
        ("load_deepstack_merger_linear_fc1_bias", load_layer_fn),
        ("load_deepstack_merger_linear_fc2_weight", load_layer_fn),
        ("load_deepstack_merger_linear_fc2_bias", load_layer_fn),
        ("load_deepstack_merger_norm_weight", load_layer_fn),
        ("load_deepstack_merger_norm_bias", load_layer_fn),
        # Merger (whole-model tensors)
        ("load_merger_linear_fc1_weight", load_global_fn),
        ("load_merger_linear_fc1_bias", load_global_fn),
        ("load_merger_linear_fc2_weight", load_global_fn),
        ("load_merger_linear_fc2_bias", load_global_fn),
        ("load_merger_norm_weight", load_global_fn),
        ("load_merger_norm_bias", load_global_fn),
    ]
class Qwen3vlWeightLoaderCStruct(Structure):
    """ctypes mirror of the C `Qwen3vlWeightLoader` (language + vision tables)."""

    _fields_ = [
        ("lang_loader", Qwen3vlLangWeightLoaderCStruct),
        ("vis_loader", Qwen3vlVisWeightLoaderCStruct),
    ]
@register_model
class Qwen3vlModel(BaseModel):
    """ctypes binding of the Qwen3-VL C inference API onto the BaseModel interface."""

    @classmethod
    def register_lib(cls, lib):
        """Register Qwen3vl model functions with the library"""
        lib.createQwen3vlWeightLoader.argtypes = []
        lib.createQwen3vlWeightLoader.restype = POINTER(Qwen3vlWeightLoaderCStruct)
        lib.createQwen3vlWeights.argtypes = [
            POINTER(Qwen3vlMetaCStruct),
            DeviceType,
            c_int,
            POINTER(c_int),
            c_bool,
        ]
        lib.createQwen3vlWeights.restype = POINTER(Qwen3vlWeightsCStruct)
        lib.createQwen3vlModel.argtypes = [
            POINTER(Qwen3vlMetaCStruct),
            POINTER(Qwen3vlWeightsCStruct),
        ]
        lib.createQwen3vlModel.restype = POINTER(Qwen3vlModelCStruct)
        lib.destroyQwen3vlModel.argtypes = [POINTER(Qwen3vlModelCStruct)]
        lib.createQwen3vlCache.argtypes = [POINTER(Qwen3vlModelCStruct)]
        lib.createQwen3vlCache.restype = POINTER(Qwen3vlCacheCStruct)
        lib.dropQwen3vlCache.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(Qwen3vlCacheCStruct),
        ]
        # Argument order must stay in sync with inferBatchQwen3vl in the C header.
        lib.inferBatchQwen3vl.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(c_uint),  # tokens
            c_uint,  # ntok
            c_void_p,  # pixel_values,
            c_uint,  # total_patches,
            POINTER(c_uint),  # image_grid_thw,
            c_uint,  # num_images,
            c_void_p,  # pixel_values_videos,
            c_uint,  # total_patches_videos,
            POINTER(c_uint),  # video_grid_thw,
            c_uint,  # num_videos,
            c_uint,  # patch_features,
            POINTER(c_uint),  # req_lens
            c_uint,  # nreq
            POINTER(c_uint),  # req_pos
            POINTER(POINTER(Qwen3vlCacheCStruct)),  # caches
            POINTER(c_float),  # temperature
            POINTER(c_uint),  # topk
            POINTER(c_float),  # topp
            POINTER(c_uint),  # output
        ]
        # Argument order must stay in sync with forwardBatchQwen3vl in the C header.
        lib.forwardBatchQwen3vl.argtypes = [
            POINTER(Qwen3vlModelCStruct),
            POINTER(c_uint),  # tokens
            c_uint,  # ntok
            c_void_p,  # pixel_values,
            c_uint,  # total_patches,
            POINTER(c_uint),  # image_grid_thw,
            c_uint,  # num_images,
            c_void_p,  # pixel_values_videos,
            c_uint,  # total_patches_videos,
            POINTER(c_uint),  # video_grid_thw,
            c_uint,  # num_videos,
            c_uint,  # patch_features,
            POINTER(c_uint),  # req_lens
            c_uint,  # nreq
            POINTER(c_uint),  # req_pos
            POINTER(POINTER(Qwen3vlCacheCStruct)),  # caches
            c_void_p,  # logits
        ]

    def create_weight_loader(self):
        """Return a pointer to the C table of weight-loading callbacks."""
        return self.lib.createQwen3vlWeightLoader()

    def create_weights(self, meta, device_type, ndev, dev_ids, transpose_weight):
        """Allocate the C-side weight store for `ndev` devices."""
        return self.lib.createQwen3vlWeights(
            meta, device_type, ndev, dev_ids, transpose_weight
        )

    def create_model(self, meta, weights):
        """Build a model instance from meta information and loaded weights."""
        return self.lib.createQwen3vlModel(meta, weights)

    def destroy_model(self, model):
        """Release a model created by create_model."""
        self.lib.destroyQwen3vlModel(model)

    def create_cache(self, model):
        """Create a KV cache usable with the given model."""
        return self.lib.createQwen3vlCache(model)

    def drop_cache(self, model, cache):
        """Release a KV cache created by create_cache."""
        self.lib.dropQwen3vlCache(model, cache)

    def infer_batch(
        self,
        model,
        tokens,
        ntok,
        pixel_values,
        total_patches,
        image_grid_thw,
        num_images,
        pixel_values_videos,
        total_patches_videos,
        video_grid_thw,
        num_videos,
        patch_features,
        req_lens,
        nreq,
        req_pos,
        caches,
        temperature,
        topk,
        topp,
        output,
    ):
        """Run one batched inference step, sampling one new token per request into `output`."""
        self.lib.inferBatchQwen3vl(
            model,
            tokens,
            ntok,
            pixel_values,
            total_patches,
            image_grid_thw,
            num_images,
            pixel_values_videos,
            total_patches_videos,
            video_grid_thw,
            num_videos,
            patch_features,
            req_lens,
            nreq,
            req_pos,
            caches,
            temperature,
            topk,
            topp,
            output,
        )

    def forward_batch(
        self,
        model,
        tokens,
        ntok,
        pixel_values,
        total_patches,
        image_grid_thw,
        num_images,
        pixel_values_videos,
        total_patches_videos,
        video_grid_thw,
        num_videos,
        patch_features,
        req_lens,
        nreq,
        req_pos,
        caches,
        logits,
    ):
        """Run one batched forward pass, writing post-output-embedding logits into `logits`."""
        self.lib.forwardBatchQwen3vl(
            model,
            tokens,
            ntok,
            pixel_values,
            total_patches,
            image_grid_thw,
            num_images,
            pixel_values_videos,
            total_patches_videos,
            video_grid_thw,
            num_videos,
            patch_features,
            req_lens,
            nreq,
            req_pos,
            caches,
            logits,
        )
import ctypes
from typing import List, Sequence
from tqdm import tqdm
from libinfinicore_infer import (
Qwen3vlModel,
Qwen3vlMetaCStruct,
TextMetaCStruct,
VisMetaCStruct,
Qwen3vlWeightsCStruct,
Qwen3vlCacheCStruct,
DataType,
DeviceType,
)
from infer_task import InferTask, KVCache
from ctypes import POINTER, c_float, c_int, c_uint, c_uint16, c_void_p, byref, c_bool
import os
from pathlib import Path
import safetensors
import sys
import time
import json
import math
import torch
import transformers
# Keep all torch tensors on CPU: weights are handed to the C runtime via raw host pointers.
torch.set_default_device("cpu")
class Qwen3vlLangWeightsNaming:
    """Builds safetensors key names for the Qwen3-VL language-model weights."""

    _ROOT = "model.language_model"

    def _layer(self, i, leaf):
        # All per-layer tensors live under model.language_model.layers.<i>.
        return f"{self._ROOT}.layers.{i}.{leaf}"

    def input_embd(self):
        return f"{self._ROOT}.embed_tokens.weight"

    def output_embd(self):
        # Output embedding is tied to the input embedding table.
        return f"{self._ROOT}.embed_tokens.weight"

    def output_norm(self):
        return f"{self._ROOT}.norm.weight"

    def attn_norm(self, i):
        return self._layer(i, "input_layernorm.weight")

    def attn_q_proj(self, i):
        return self._layer(i, "self_attn.q_proj.weight")

    def attn_q_norm(self, i):
        return self._layer(i, "self_attn.q_norm.weight")

    def attn_k_proj(self, i):
        return self._layer(i, "self_attn.k_proj.weight")

    def attn_k_norm(self, i):
        return self._layer(i, "self_attn.k_norm.weight")

    def attn_o_proj(self, i):
        return self._layer(i, "self_attn.o_proj.weight")

    def attn_v_proj(self, i):
        return self._layer(i, "self_attn.v_proj.weight")

    def mlp_norm(self, i):
        return self._layer(i, "post_attention_layernorm.weight")

    def mlp_gate(self, i):
        return self._layer(i, "mlp.gate_proj.weight")

    def mlp_down(self, i):
        return self._layer(i, "mlp.down_proj.weight")

    def mlp_up(self, i):
        return self._layer(i, "mlp.up_proj.weight")
class Qwen3vlVisWeightsNaming:
    """Builds safetensors key names for the Qwen3-VL vision-tower weights."""

    _ROOT = "model.visual"

    def _block(self, i, leaf):
        # Per-block tensors live under model.visual.blocks.<i>.
        return f"{self._ROOT}.blocks.{i}.{leaf}"

    def _deepstack(self, i, leaf):
        # Deepstack merger tensors, indexed by position in the merger list.
        return f"{self._ROOT}.deepstack_merger_list.{i}.{leaf}"

    def _merger(self, leaf):
        # Final merger (projector) tensors.
        return f"{self._ROOT}.merger.{leaf}"

    def patch_embed_weight(self):
        return f"{self._ROOT}.patch_embed.proj.weight"

    def patch_embed_bias(self):
        return f"{self._ROOT}.patch_embed.proj.bias"

    def pos_embed_weight(self):
        return f"{self._ROOT}.pos_embed.weight"

    def attn_proj_weight(self, i):
        return self._block(i, "attn.proj.weight")

    def attn_proj_bias(self, i):
        return self._block(i, "attn.proj.bias")

    def attn_qkv_weight(self, i):
        return self._block(i, "attn.qkv.weight")

    def attn_qkv_bias(self, i):
        return self._block(i, "attn.qkv.bias")

    def mlp_linear_fc1_weight(self, i):
        return self._block(i, "mlp.linear_fc1.weight")

    def mlp_linear_fc1_bias(self, i):
        return self._block(i, "mlp.linear_fc1.bias")

    def mlp_linear_fc2_weight(self, i):
        return self._block(i, "mlp.linear_fc2.weight")

    def mlp_linear_fc2_bias(self, i):
        return self._block(i, "mlp.linear_fc2.bias")

    def norm1_weight(self, i):
        return self._block(i, "norm1.weight")

    def norm1_bias(self, i):
        return self._block(i, "norm1.bias")

    def norm2_weight(self, i):
        return self._block(i, "norm2.weight")

    def norm2_bias(self, i):
        return self._block(i, "norm2.bias")

    def deepstack_merger_linear_fc1_weight(self, i):
        return self._deepstack(i, "linear_fc1.weight")

    def deepstack_merger_linear_fc1_bias(self, i):
        return self._deepstack(i, "linear_fc1.bias")

    def deepstack_merger_linear_fc2_weight(self, i):
        return self._deepstack(i, "linear_fc2.weight")

    def deepstack_merger_linear_fc2_bias(self, i):
        return self._deepstack(i, "linear_fc2.bias")

    def deepstack_merger_norm_weight(self, i):
        return self._deepstack(i, "norm.weight")

    def deepstack_merger_norm_bias(self, i):
        return self._deepstack(i, "norm.bias")

    def merger_linear_fc1_weight(self):
        return self._merger("linear_fc1.weight")

    def merger_linear_fc1_bias(self):
        return self._merger("linear_fc1.bias")

    def merger_linear_fc2_weight(self):
        return self._merger("linear_fc2.weight")

    def merger_linear_fc2_bias(self):
        return self._merger("linear_fc2.bias")

    def merger_norm_weight(self):
        return self._merger("norm.weight")

    def merger_norm_bias(self):
        return self._merger("norm.bias")
class Qwen3vlMeta(Qwen3vlMetaCStruct):
    """Python-side meta filled from a HuggingFace-style Qwen3-VL config dict."""

    def __init__(self, config, max_tokens=None):
        text_cfg = config["text_config"]
        vis_cfg = config["vision_config"]

        # Map the config's dtype string to the C enum and the torch dtype.
        dtype_table = {
            "float16": (DataType.INFINI_DTYPE_F16, torch.float16),
            "float32": (DataType.INFINI_DTYPE_F32, torch.float32),
            "bfloat16": (DataType.INFINI_DTYPE_BF16, torch.bfloat16),
        }
        dtype_name = text_cfg["dtype"]
        if dtype_name not in dtype_table:
            raise ValueError(f"Unsupported text dtype: {dtype_name}")
        dt_, self.torch_dtype = dtype_table[dtype_name]

        # Default max_tokens to the model's trained context length.
        effective_max_tokens = (
            text_cfg["max_position_embeddings"] if max_tokens is None else max_tokens
        )

        super().__init__(
            dtype=dt_,
            image_token_id=config["image_token_id"],
            video_token_id=config["video_token_id"],
            vision_end_token_id=config["vision_end_token_id"],
            vision_start_token_id=config["vision_start_token_id"],
            text_meta=TextMetaCStruct(
                bos_token_id=text_cfg["bos_token_id"],
                eos_token_id=text_cfg["eos_token_id"],
                head_dim=text_cfg["head_dim"],
                hidden_size=text_cfg["hidden_size"],
                initializer_range=text_cfg["initializer_range"],
                intermediate_size=text_cfg["intermediate_size"],
                max_tokens=effective_max_tokens,
                num_attention_heads=text_cfg["num_attention_heads"],
                num_hidden_layers=text_cfg["num_hidden_layers"],
                num_key_value_heads=text_cfg["num_key_value_heads"],
                rms_norm_eps=text_cfg["rms_norm_eps"],
                mrope_section=(ctypes.c_ulong * 3)(
                    *text_cfg["rope_scaling"]["mrope_section"]
                ),
                rope_theta=text_cfg["rope_theta"],
                vocab_size=text_cfg["vocab_size"],
            ),
            vis_meta=VisMetaCStruct(
                depth=vis_cfg["depth"],
                deepstack_visual_indexes=(ctypes.c_ulong * 3)(
                    *vis_cfg["deepstack_visual_indexes"]
                ),
                hidden_size=vis_cfg["hidden_size"],
                in_channels=vis_cfg["in_channels"],
                initializer_range=vis_cfg["initializer_range"],
                intermediate_size=vis_cfg["intermediate_size"],
                num_heads=vis_cfg["num_heads"],
                num_position_embeddings=vis_cfg["num_position_embeddings"],
                out_hidden_size=vis_cfg["out_hidden_size"],
                patch_size=vis_cfg["patch_size"],
                spatial_merge_size=vis_cfg["spatial_merge_size"],
                temporal_patch_size=vis_cfg["temporal_patch_size"],
            ),
        )
def load_specific_tensor(model_dir, tensor_name):
    """
    Load a single named tensor from the ``.safetensors`` files in ``model_dir``.

    Scans every ``*.safetensors`` file in the directory (sorted, so shard scan
    order is deterministic — ``os.listdir`` order is not) and returns the first
    match as a CPU torch tensor. Note: no ``index.json`` lookup is performed;
    shards are probed directly.

    Args:
        model_dir: directory containing the model's .safetensors shards.
        tensor_name: fully-qualified tensor key to look up.

    Returns:
        The tensor loaded on CPU.

    Raises:
        FileNotFoundError: if the directory contains no .safetensors files.
        KeyError: if the tensor is not present in any file.
    """
    safetensors_files = sorted(
        f for f in os.listdir(model_dir) if f.endswith(".safetensors")
    )
    if not safetensors_files:
        raise FileNotFoundError(f"No .safetensors files found in {model_dir}")
    for filename in safetensors_files:
        tensor_file = os.path.join(model_dir, filename)
        try:
            with safetensors.safe_open(tensor_file, framework="pt", device="cpu") as f:
                if tensor_name in f.keys():
                    return f.get_tensor(tensor_name)
        except Exception:
            # Deliberately broad: skip unreadable/corrupt shards and keep probing
            # the remaining files; a truly missing tensor raises KeyError below.
            continue
    # If we reach here, tensor was not found in any file
    raise KeyError(f"{tensor_name} not found in any .safetensors files")
def _load_global(fn, weights, model_path, name, dtype):
    """Load one whole-model tensor and hand its host pointer to the C loader.

    NOTE(review): assumes the C loader copies the data before returning —
    the original code also freed each tensor immediately after the call.
    """
    tensor = load_specific_tensor(model_path, name).to(dtype)
    fn(weights, tensor.data_ptr())
    del tensor


def _load_layer(fn, weights, model_path, name, dtype, layer):
    """Load one per-layer tensor and hand its host pointer to the C loader."""
    tensor = load_specific_tensor(model_path, name).to(dtype)
    fn(weights, tensor.data_ptr(), layer)
    del tensor


def _load_lang_weights(meta, weights, model_path, ndev, loader):
    """Load the language-model weights, re-laying projections out for `ndev`-way tensor parallelism."""
    names = Qwen3vlLangWeightsNaming()
    dt = meta.torch_dtype
    tm = meta.text_meta
    nkvh = tm.num_key_value_heads
    nh = tm.num_attention_heads
    dh = tm.head_dim
    d = tm.hidden_size
    di = tm.intermediate_size
    assert nh % nkvh == 0
    assert nh % ndev == 0
    assert nkvh % ndev == 0
    assert di % ndev == 0

    _load_global(loader.load_input_embd, weights, model_path, names.input_embd(), dt)
    _load_global(loader.load_output_norm, weights, model_path, names.output_norm(), dt)
    _load_global(loader.load_output_embd, weights, model_path, names.output_embd(), dt)

    nh_dev = nh // ndev
    nkvh_dev = nkvh // ndev
    di_dev = di // ndev
    for i in range(tm.num_hidden_layers):
        _load_layer(loader.load_attn_norm, weights, model_path, names.attn_norm(i), dt, i)

        # Fuse Q/K/V into one tensor, stacked per device: [Q0 K0 V0 Q1 K1 V1 ...].
        q = load_specific_tensor(model_path, names.attn_q_proj(i)).reshape(nh, dh, d)
        k = load_specific_tensor(model_path, names.attn_k_proj(i)).reshape(nkvh, dh, d)
        v = load_specific_tensor(model_path, names.attn_v_proj(i)).reshape(nkvh, dh, d)
        parts = []
        for dev in range(ndev):
            parts.append(q[dev * nh_dev : (dev + 1) * nh_dev, :, :])
            parts.append(k[dev * nkvh_dev : (dev + 1) * nkvh_dev, :, :])
            parts.append(v[dev * nkvh_dev : (dev + 1) * nkvh_dev, :, :])
        qkv = torch.cat(parts, dim=0).to(dt).contiguous()
        loader.load_attn_qkv_proj(weights, qkv.data_ptr(), i)
        del q, k, v, parts, qkv

        _load_layer(loader.load_attn_q_norm, weights, model_path, names.attn_q_norm(i), dt, i)
        _load_layer(loader.load_attn_k_norm, weights, model_path, names.attn_k_norm(i), dt, i)

        # o_proj: device-major layout [ndev, d, nh_dev * dh].
        o_proj = (
            load_specific_tensor(model_path, names.attn_o_proj(i))
            .to(dt)
            .reshape([d, ndev, nh // ndev * dh])
            .transpose(0, 1)
            .contiguous()
        )
        loader.load_attn_o_proj(weights, o_proj.data_ptr(), i)
        del o_proj

        _load_layer(loader.load_mlp_norm, weights, model_path, names.mlp_norm(i), dt, i)

        # Fuse gate/up into one tensor, stacked per device: [G0 U0 G1 U1 ...].
        gate = load_specific_tensor(model_path, names.mlp_gate(i))
        up = load_specific_tensor(model_path, names.mlp_up(i))
        chunks = []
        for dev in range(ndev):
            start = dev * di_dev
            end = (dev + 1) * di_dev
            chunks.append(gate[start:end, :])
            chunks.append(up[start:end, :])
        gate_up = torch.cat(chunks, dim=0).to(dt).contiguous()
        loader.load_mlp_gate_up(weights, gate_up.data_ptr(), i)
        del gate, up, chunks, gate_up

        # down_proj: device-major layout [ndev, d, di_dev].
        down = (
            load_specific_tensor(model_path, names.mlp_down(i))
            .to(dt)
            .reshape([d, ndev, di // ndev])
            .transpose(0, 1)
            .contiguous()
        )
        loader.load_mlp_down(weights, down.data_ptr(), i)
        del down


def _load_vis_weights(meta, weights, model_path, loader):
    """Load the vision-tower weights (no tensor-parallel re-layout)."""
    names = Qwen3vlVisWeightsNaming()
    dt = meta.torch_dtype

    _load_global(loader.load_patch_embed_weight, weights, model_path, names.patch_embed_weight(), dt)
    _load_global(loader.load_patch_embed_bias, weights, model_path, names.patch_embed_bias(), dt)
    _load_global(loader.load_pos_embed_weight, weights, model_path, names.pos_embed_weight(), dt)

    # Per transformer block, in the same order the original loader used.
    block_loads = [
        (loader.load_attn_proj_weight, names.attn_proj_weight),
        (loader.load_attn_proj_bias, names.attn_proj_bias),
        (loader.load_attn_qkv_weight, names.attn_qkv_weight),
        (loader.load_attn_qkv_bias, names.attn_qkv_bias),
        (loader.load_mlp_linear_fc1_weight, names.mlp_linear_fc1_weight),
        (loader.load_mlp_linear_fc1_bias, names.mlp_linear_fc1_bias),
        (loader.load_mlp_linear_fc2_weight, names.mlp_linear_fc2_weight),
        (loader.load_mlp_linear_fc2_bias, names.mlp_linear_fc2_bias),
        (loader.load_norm1_weight, names.norm1_weight),
        (loader.load_norm1_bias, names.norm1_bias),
        (loader.load_norm2_weight, names.norm2_weight),
        (loader.load_norm2_bias, names.norm2_bias),
    ]
    for i in range(meta.vis_meta.depth):
        for fn, name_fn in block_loads:
            _load_layer(fn, weights, model_path, name_fn(i), dt, i)

    # Deepstack mergers, one per entry in deepstack_visual_indexes.
    deepstack_loads = [
        (loader.load_deepstack_merger_linear_fc1_weight, names.deepstack_merger_linear_fc1_weight),
        (loader.load_deepstack_merger_linear_fc1_bias, names.deepstack_merger_linear_fc1_bias),
        (loader.load_deepstack_merger_linear_fc2_weight, names.deepstack_merger_linear_fc2_weight),
        (loader.load_deepstack_merger_linear_fc2_bias, names.deepstack_merger_linear_fc2_bias),
        (loader.load_deepstack_merger_norm_weight, names.deepstack_merger_norm_weight),
        (loader.load_deepstack_merger_norm_bias, names.deepstack_merger_norm_bias),
    ]
    for i in range(len(meta.vis_meta.deepstack_visual_indexes)):
        for fn, name_fn in deepstack_loads:
            _load_layer(fn, weights, model_path, name_fn(i), dt, i)

    # Final merger (projector).
    _load_global(loader.load_merger_linear_fc1_weight, weights, model_path, names.merger_linear_fc1_weight(), dt)
    _load_global(loader.load_merger_linear_fc1_bias, weights, model_path, names.merger_linear_fc1_bias(), dt)
    _load_global(loader.load_merger_linear_fc2_weight, weights, model_path, names.merger_linear_fc2_weight(), dt)
    _load_global(loader.load_merger_linear_fc2_bias, weights, model_path, names.merger_linear_fc2_bias(), dt)
    _load_global(loader.load_merger_norm_weight, weights, model_path, names.merger_norm_weight(), dt)
    _load_global(loader.load_merger_norm_bias, weights, model_path, names.merger_norm_bias(), dt)


def load_Qwen3vl_weights(
    meta: Qwen3vlMeta,
    weights,
    model_path: str,
    ndev: int,
):
    """
    Load all Qwen3-VL weights from ``model_path`` into the C-side store ``weights``.

    Language-model projection weights are restacked for ``ndev``-way tensor
    parallelism (per-device QKV and gate/up stacking, device-major o_proj and
    down_proj transposes); vision-tower tensors are passed through unchanged.
    Each tensor is cast to ``meta.torch_dtype`` and handed to the C loader via
    its raw host pointer.
    """
    model_instance = Qwen3vlModel()
    weight_loader = model_instance.create_weight_loader()
    _load_lang_weights(
        meta, weights, model_path, ndev, weight_loader.contents.lang_loader
    )
    _load_vis_weights(meta, weights, model_path, weight_loader.contents.vis_loader)
class Qwen3vlBatchedTask:
    """Flattens a batch of InferTasks (plus optional image/video inputs) into
    the ctypes arguments expected by the C inference entry point.

    All ctypes arrays and flattened pixel tensors are kept as attributes so
    their underlying buffers stay alive for the duration of the C call.
    """

    def __init__(
        self,
        tasks: List[InferTask],
        all_pixel_values=None,
        all_image_grid_thw=None,
        all_pixel_values_videos=None,
        all_video_grid_thw=None,
    ):
        self.tasks = tasks
        self.nreq = len(tasks)
        # Per-request scalar fields.
        token_lists = [t.tokens for t in tasks]
        self.req_lens_list = [len(toks) for toks in token_lists]
        self.req_pos_list = [t.pos for t in tasks]
        self.kv_cache_ptrs = [t.kvcache().data() for t in tasks]
        self.temperaturas_list = [t.temperature for t in tasks]
        self.topks_list = [t.topk for t in tasks]
        self.topps_list = [t.topp for t in tasks]
        # Flatten all request tokens into one contiguous array.
        flat_tokens = [tok for toks in token_lists for tok in toks]
        self.ntok = len(flat_tokens)
        # Convert to ctypes arrays in one pass.
        self.tokens = (c_uint * self.ntok)(*flat_tokens)
        self.req_lens = (c_uint * self.nreq)(*self.req_lens_list)
        self.req_pos = (c_uint * self.nreq)(*self.req_pos_list)
        self.kv_caches = (POINTER(Qwen3vlCacheCStruct) * self.nreq)(*self.kv_cache_ptrs)
        self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list)
        self.topks = (c_uint * self.nreq)(*self.topks_list)
        self.topps = (c_float * self.nreq)(*self.topps_list)
        # Visual encoder inputs default to "absent".
        self.pixel_values = None
        self.total_patches = 0
        self.image_grid_thw = None
        self.num_images = 0
        self.pixel_values_videos = None
        self.total_patches_videos = 0
        self.video_grid_thw = None
        self.num_videos = 0
        self.patch_features = 0
        # Prepare visual encoder inputs.
        if all_pixel_values is not None:
            # Accept either a single tensor or a list of per-task tensors.
            concat_pixel_values = (
                torch.cat(all_pixel_values, dim=0)
                if isinstance(all_pixel_values, list)
                else all_pixel_values
            )  # (total_patches, features)
            # BUGFIX: print after concatenation — a list has no `.shape`.
            print(concat_pixel_values.shape)
            self.total_patches = concat_pixel_values.shape[0]
            self.patch_features = concat_pixel_values.shape[1]
            # Keep a reference so the buffer outlives the C call.
            self.flat_pixels = (
                concat_pixel_values.flatten().to(torch.bfloat16).contiguous()
            )
            self.pixel_values = self.flat_pixels.data_ptr()
        if all_image_grid_thw is not None:
            concat_grid_thw = (
                torch.cat(all_image_grid_thw, dim=0)
                if isinstance(all_image_grid_thw, list)
                else all_image_grid_thw
            )  # (total_images, 3)
            self.num_images = concat_grid_thw.shape[0]
            self.flat_grid = (
                concat_grid_thw.flatten().to(torch.int32).contiguous().tolist()
            )
            self.image_grid_thw = (c_uint * len(self.flat_grid))(*self.flat_grid)
        if all_pixel_values_videos is not None:
            # Same tensor-or-list handling as the image path for consistency.
            concat_pixel_values_videos = (
                torch.cat(all_pixel_values_videos, dim=0)
                if isinstance(all_pixel_values_videos, list)
                else all_pixel_values_videos
            )  # (total_patches_videos, features)
            self.total_patches_videos = concat_pixel_values_videos.shape[0]
            self.patch_features_videos = concat_pixel_values_videos.shape[1]
            print(self.patch_features_videos, flush=True)
            self.flat_pixels_videos = (
                concat_pixel_values_videos.flatten().to(torch.bfloat16).contiguous()
            )
            # BUGFIX: torch tensors have no `.ctypes` attribute (that is a
            # numpy API); use data_ptr() like the image path above.
            self.pixel_values_videos = self.flat_pixels_videos.data_ptr()
        if all_video_grid_thw is not None:
            concat_grid_thw_videos = (
                torch.cat(all_video_grid_thw, dim=0)
                if isinstance(all_video_grid_thw, list)
                else all_video_grid_thw
            )  # (total_videos, 3)
            self.num_videos = concat_grid_thw_videos.shape[0]
            flat_grid_videos = (
                concat_grid_thw_videos.flatten().to(torch.int32).contiguous()
            )
            self.video_grid_thw = (c_uint * len(flat_grid_videos))(
                *flat_grid_videos.tolist()
            )

    def input_args(self):
        """Return the positional arguments for the C `infer_batch` call, in
        the exact order the C signature expects."""
        return (
            self.tokens,
            self.ntok,
            self.pixel_values,
            self.total_patches,
            self.image_grid_thw,
            self.num_images,
            self.pixel_values_videos,
            self.total_patches_videos,
            self.video_grid_thw,
            self.num_videos,
            self.patch_features,
            self.req_lens,
            self.nreq,
            self.req_pos,
            self.kv_caches,
            self.temperaturas,
            self.topks,
            self.topps,
        )
# TODO: the visual encoder's cache and the image/video inputs still need handling.
class Qwen3vlForCauslLM:
    """ctypes front end for the Qwen3-VL causal LM: loads config/weights,
    creates the native model instance, and drives batched decoding."""

    def __init__(
        self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None
    ):
        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
            config = json.load(f)
        self.config = config
        eos_token_id = self.config["text_config"]["eos_token_id"]
        # Normalize to a list so membership tests work uniformly.
        self.eos_token_id = (
            [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
        )
        print(model_dir_path)
        if "qwen3_vl" == config["model_type"]:
            self.meta = Qwen3vlMeta(config, max_tokens=max_tokens)
            self.processor = transformers.AutoProcessor.from_pretrained(model_dir_path)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
        else:
            raise ValueError("Unsupported model architecture")
        print(f"Creating model on {ndev} devices...")
        load_start_time = time.time()
        dev_ids = (c_int * ndev)(*range(ndev))
        self.model_instance = Qwen3vlModel()
        weights = self.model_instance.create_weights(
            byref(self.meta), device, ndev, dev_ids, c_bool(True)
        )
        print("Loading weights...")
        # Load weights from host memory into the native weight store.
        load_Qwen3vl_weights(self.meta, weights, model_dir_path, ndev)
        # Create the native model instance bound to those weights.
        self.model_ptr = self.model_instance.create_model(
            byref(self.meta),
            weights,
        )
        load_end_time = time.time()
        print(f"Time used: {load_end_time - load_start_time:.3f}s")

    def max_context_len(self):
        """Maximum number of tokens (prompt + generation) per request."""
        return self.meta.text_meta.max_tokens

    def create_kv_cache(self):
        """Allocate a native KV cache for one request."""
        return self.model_instance.create_cache(self.model_ptr)

    def drop_kv_cache(self, kv_cache):
        """Release a native KV cache created by create_kv_cache()."""
        self.model_instance.drop_cache(self.model_ptr, kv_cache)

    def batch_infer_one_round(
        self,
        tasks: List[InferTask],
        all_pixel_values=None,
        all_image_grid_thw=None,
        all_pixel_values_videos=None,
        all_video_grid_thw=None,
    ):
        """Run one decoding step for every task; returns one token per task."""
        output = (c_uint * len(tasks))()
        batch_inputs = Qwen3vlBatchedTask(
            tasks,
            all_pixel_values,
            all_image_grid_thw,
            all_pixel_values_videos,
            all_video_grid_thw,
        )
        self.model_instance.infer_batch(
            self.model_ptr,
            *(batch_inputs.input_args()),
            output,
        )
        return list(output)

    def generate(
        self, input_content, max_steps=0, topp_=1.0, topk_=1, temperature_=1.0
    ):
        """Greedy/sampled generation for a single chat turn.

        Returns (output_text, avg_decode_ms); avg excludes the first (prefill)
        step. max_steps == 0 means "up to the context limit".
        """
        inputs = self.processor.apply_chat_template(
            conversation=[{"role": "user", "content": input_content}],
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        tokens = inputs["input_ids"][0].tolist()
        pixel_values = inputs["pixel_values"] if "pixel_values" in inputs else None
        image_grid_thw = (
            inputs["image_grid_thw"] if "image_grid_thw" in inputs else None
        )
        pixel_values_videos = (
            inputs["pixel_values_videos"] if "pixel_values_videos" in inputs else None
        )
        video_grid_thw = (
            inputs["video_grid_thw"] if "video_grid_thw" in inputs else None
        )
        infer_task = InferTask(
            0,
            tokens,
            self.max_context_len(),
            temperature_,
            topk_,
            topp_,
            self.eos_token_id,
        )
        infer_task.bind_kvcache(KVCache(self))
        print(input_content)
        steps = 0
        total_time = 0
        output_content = ""
        for step_i in range(max_steps if max_steps > 0 else self.max_context_len()):
            start_time = time.time()
            output_tokens = self.batch_infer_one_round(
                [infer_task],
                pixel_values,
                image_grid_thw,
                pixel_values_videos,
                video_grid_thw,
            )
            end_time = time.time()
            steps += 1
            output_str = self.tokenizer.decode(output_tokens[0])
            output_content += output_str
            print(output_str, end="", flush=True)
            # Visual inputs are consumed by the first (prefill) step only.
            pixel_values = None
            image_grid_thw = None
            pixel_values_videos = None
            video_grid_thw = None
            if output_tokens[0] in self.eos_token_id:
                break
            infer_task.next(output_tokens[0])
            # Skip the first step's latency (prefill is not representative).
            if step_i > 0:
                total_time += end_time - start_time
        print("\n")
        # BUGFIX: total_time excludes step 0, so average over steps - 1
        # decode steps (the old code divided by steps, skewing the metric).
        avg_time = total_time * 1000 / (steps - 1) if steps > 1 else -1
        print(f"Time per step: {avg_time:.3f}ms")
        infer_task._kv_cache.drop(self)
        return output_content, avg_time

    def destroy_model_instance(self):
        """Free the native model; the object must not be used afterwards."""
        self.model_instance.destroy_model(self.model_ptr)
        print("Model destroyed")
def test():
    """CLI entry point: parse the device flag, model path and optional device
    count / image URL, then run one generation round."""
    # Flag -> device mapping keeps the accepted flags and the usage message
    # in sync (the old usage text omitted --iluvatar even though it was
    # accepted by the elif chain).
    device_types = {
        "--cpu": DeviceType.DEVICE_TYPE_CPU,
        "--nvidia": DeviceType.DEVICE_TYPE_NVIDIA,
        "--cambricon": DeviceType.DEVICE_TYPE_CAMBRICON,
        "--ascend": DeviceType.DEVICE_TYPE_ASCEND,
        "--metax": DeviceType.DEVICE_TYPE_METAX,
        "--moore": DeviceType.DEVICE_TYPE_MOORE,
        "--iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR,
    }
    usage = (
        "Usage: python qwen3vl.py "
        "[--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar] "
        "<path/to/model_dir> [n_device]"
    )
    if len(sys.argv) < 3 or sys.argv[1] not in device_types:
        print(usage)
        sys.exit(1)
    device_type = device_types[sys.argv[1]]
    model_path = sys.argv[2]
    ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    img_url = sys.argv[4] if len(sys.argv) > 4 else None
    model = Qwen3vlForCauslLM(model_path, device_type, ndev, max_tokens=1024)
    input_content = (
        [
            {"type": "text", "text": "Describe this image."},
            {"type": "image", "url": img_url},
        ]
        if img_url is not None
        else [{"type": "text", "text": "山东最高的山是?"}]
    )
    model.generate(input_content)
    model.destroy_model_instance()


if __name__ == "__main__":
    test()
...@@ -16,7 +16,7 @@ public: ...@@ -16,7 +16,7 @@ public:
class MemoryPool : public AllocatorBase { class MemoryPool : public AllocatorBase {
public: public:
static constexpr size_t DEFAULT_ALIGNMENT = 256; static constexpr size_t DEFAULT_ALIGNMENT = 512;
explicit MemoryPool(size_t initialSize = 0, size_t alignment = DEFAULT_ALIGNMENT); explicit MemoryPool(size_t initialSize = 0, size_t alignment = DEFAULT_ALIGNMENT);
~MemoryPool(); ~MemoryPool();
......
...@@ -153,6 +153,8 @@ public: ...@@ -153,6 +153,8 @@ public:
class CacheManager { class CacheManager {
public: public:
DECLARE_OP_CACHE(Add) DECLARE_OP_CACHE(Add)
DECLARE_OP_CACHE(Conv)
DECLARE_OP_CACHE(Mul)
DECLARE_OP_CACHE(RMSNorm) DECLARE_OP_CACHE(RMSNorm)
DECLARE_OP_CACHE(Gemm) DECLARE_OP_CACHE(Gemm)
DECLARE_OP_CACHE(RoPE) DECLARE_OP_CACHE(RoPE)
...@@ -160,11 +162,14 @@ public: ...@@ -160,11 +162,14 @@ public:
DECLARE_OP_CACHE(CausalSoftmax) DECLARE_OP_CACHE(CausalSoftmax)
DECLARE_OP_CACHE(Topkrouter) DECLARE_OP_CACHE(Topkrouter)
DECLARE_OP_CACHE(SwiGLU) DECLARE_OP_CACHE(SwiGLU)
DECLARE_OP_CACHE(Silu)
DECLARE_OP_CACHE(RandomSample) DECLARE_OP_CACHE(RandomSample)
DECLARE_OP_CACHE(DequantizeAWQ) DECLARE_OP_CACHE(DequantizeAWQ)
CacheManager(size_t capacity = 100) CacheManager(size_t capacity = 100)
: Add_cache(capacity, DESTROY_FUNC(Add)), : Add_cache(capacity, DESTROY_FUNC(Add)),
Conv_cache(capacity, DESTROY_FUNC(Conv)),
Mul_cache(capacity, DESTROY_FUNC(Mul)),
RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)), RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)),
Gemm_cache(capacity, DESTROY_FUNC(Gemm)), Gemm_cache(capacity, DESTROY_FUNC(Gemm)),
RoPE_cache(capacity, DESTROY_FUNC(RoPE)), RoPE_cache(capacity, DESTROY_FUNC(RoPE)),
...@@ -172,6 +177,7 @@ public: ...@@ -172,6 +177,7 @@ public:
CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)), CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)),
Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)), Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)),
SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)), SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)),
Silu_cache(capacity, DESTROY_FUNC(Silu)),
RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)), RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)),
DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {} DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {}
......
...@@ -33,6 +33,61 @@ void InferenceContext::add(std::shared_ptr<Tensor> c, ...@@ -33,6 +33,61 @@ void InferenceContext::add(std::shared_ptr<Tensor> c,
c->data(), a->data(), b->data(), stream)); c->data(), a->data(), b->data(), stream));
} }
// Runs a convolution y = conv(x, w) + bias via infiniop, memoizing the
// descriptor in the per-context cache.
void InferenceContext::conv(std::shared_ptr<Tensor> y,
                            std::shared_ptr<Tensor> x,
                            std::shared_ptr<Tensor> w,
                            std::shared_ptr<Tensor> bias,
                            void *pads,
                            void *strides,
                            void *dilations,
                            size_t n) {
    size_t key = CacheManager::createDescriptorKey(y, x, w, bias);
    // BUGFIX: hash the *contents* of pads/strides/dilations, not the pointer
    // values. Stack addresses are routinely reused across calls, so hashing
    // the addresses could produce a false cache hit and run a descriptor
    // built for different conv parameters.
    // NOTE(review): element types (size_t/ptrdiff_t/size_t) match the only
    // caller in this file (inferDeviceBatchVision) — confirm for new callers.
    const size_t *pad_vals = static_cast<const size_t *>(pads);
    const ptrdiff_t *stride_vals = static_cast<const ptrdiff_t *>(strides);
    const size_t *dilation_vals = static_cast<const size_t *>(dilations);
    for (size_t i = 0; i < n; ++i) {
        hash_combine(key, std::hash<size_t>()(pad_vals[i]));
        hash_combine(key, std::hash<ptrdiff_t>()(stride_vals[i]));
        hash_combine(key, std::hash<size_t>()(dilation_vals[i]));
    }
    hash_combine(key, std::hash<size_t>()(n));
    infiniopConvDescriptor_t desc;
    if (!cache_manager->getConvDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateConvDescriptor(
            op_handle, &desc, y->desc(), x->desc(), w->desc(),
            bias ? bias->desc() : nullptr, pads, strides, dilations, n));
        cache_manager->putConvDescriptor(key, desc);
    }
    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetConvWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();
    RUN_INFINI(infiniopConv(
        desc, workspace, workspace_size,
        y->data(), x->data(), w->data(),
        bias ? bias->data() : nullptr, stream));
}
void InferenceContext::mul(std::shared_ptr<Tensor> c,
std::shared_ptr<Tensor> a,
std::shared_ptr<Tensor> b) {
size_t key = CacheManager::createDescriptorKey(c, a, b);
infiniopMulDescriptor_t desc;
if (!cache_manager->getMulDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateMulDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
cache_manager->putMulDescriptor(key, desc);
}
size_t workspace_size = 0;
RUN_INFINI(infiniopGetMulWorkspaceSize(desc, &workspace_size));
ensure_workspace(workspace_size);
void *workspace = workspace_storage->memory();
RUN_INFINI(infiniopMul(
desc, workspace, workspace_size,
c->data(), a->data(), b->data(), stream));
}
void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y, void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> w,
...@@ -189,6 +244,26 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out, ...@@ -189,6 +244,26 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out,
out->data(), up->data(), gate->data(), stream)); out->data(), up->data(), gate->data(), stream));
} }
void InferenceContext::silu(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> input) {
size_t key = CacheManager::createDescriptorKey(out, input);
infiniopSiluDescriptor_t desc;
if (!cache_manager->getSiluDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateSiluDescriptor(
op_handle, &desc, out->desc(), input->desc()));
cache_manager->putSiluDescriptor(key, desc);
}
size_t workspace_size = 0;
RUN_INFINI(infiniopGetSiluWorkspaceSize(desc, &workspace_size));
ensure_workspace(workspace_size);
void *workspace = workspace_storage->memory();
RUN_INFINI(infiniopSilu(desc, workspace, workspace_size,
out->data(), input->data(), stream));
}
void InferenceContext::randomSample(std::shared_ptr<Tensor> out, void InferenceContext::randomSample(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> prob, std::shared_ptr<Tensor> prob,
float random_val, float top_p, uint32_t top_k, float temperature) { float random_val, float top_p, uint32_t top_k, float temperature) {
......
...@@ -19,6 +19,14 @@ struct InferenceContext { ...@@ -19,6 +19,14 @@ struct InferenceContext {
void add(std::shared_ptr<Tensor> c, void add(std::shared_ptr<Tensor> c,
std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> a,
std::shared_ptr<Tensor> b); std::shared_ptr<Tensor> b);
void conv(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w,
std::shared_ptr<Tensor> bias,
void *pads, void *strides, void *dilations, size_t n);
void mul(std::shared_ptr<Tensor> c,
std::shared_ptr<Tensor> a,
std::shared_ptr<Tensor> b);
void rmsnorm(std::shared_ptr<Tensor> y, void rmsnorm(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> w,
...@@ -48,6 +56,8 @@ struct InferenceContext { ...@@ -48,6 +56,8 @@ struct InferenceContext {
void swiglu(std::shared_ptr<Tensor> out, void swiglu(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> up,
std::shared_ptr<Tensor> gate); std::shared_ptr<Tensor> gate);
void silu(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> input);
void randomSample(std::shared_ptr<Tensor> out, void randomSample(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> prob, std::shared_ptr<Tensor> prob,
float random_val, float top_p, uint32_t top_k, float temperature); float random_val, float top_p, uint32_t top_k, float temperature);
...@@ -81,6 +91,15 @@ inline void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::share ...@@ -81,6 +91,15 @@ inline void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::share
getInferenceContext().add(c, a, b); getInferenceContext().add(c, a, b);
} }
// Free-function convenience wrapper: forwards to the active InferenceContext.
inline void conv(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, std::shared_ptr<Tensor> w, std::shared_ptr<Tensor> bias,
                 void *pads, void *strides, void *dilations, size_t n) {
    getInferenceContext().conv(y, x, w, bias, pads, strides, dilations, n);
}
// Free-function convenience wrapper: forwards to the active InferenceContext.
inline void mul(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b) {
    getInferenceContext().mul(c, a, b);
}
inline void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x, inline void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w, float epsilon) { std::shared_ptr<Tensor> w, float epsilon) {
getInferenceContext().rmsnorm(y, x, w, epsilon); getInferenceContext().rmsnorm(y, x, w, epsilon);
...@@ -131,6 +150,10 @@ inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, ...@@ -131,6 +150,10 @@ inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
getInferenceContext().swiglu(out, up, gate); getInferenceContext().swiglu(out, up, gate);
} }
// Free-function convenience wrapper: forwards to the active InferenceContext.
inline void silu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> input) {
    getInferenceContext().silu(out, input);
}
inline void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob, inline void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob,
float random_val, float top_p, uint32_t top_k, float temperature) { float random_val, float top_p, uint32_t top_k, float temperature) {
getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature); getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature);
......
#include "qwen3vl_impl.hpp"
#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "../inference_context.hpp"
#include "infinicore_infer.h"
#include <random>
#include <thread>
#include <vector>
// Initializes the per-device resources (op handle, compute stream, memory
// pool) for one device and publishes them into *rsrc.
// NOTE(review): meta, idev and ndev are currently unused — kept for signature
// parity with the per-device launch loop.
void createDeviceResource(Qwen3vlDeviceResource *rsrc, const Qwen3vlMeta *meta,
                          std::shared_ptr<Qwen3vlDeviceWeights> weights,
                          infiniDevice_t device, int idev,
                          int ndev, int dev_id,
                          infinicclComm_t comm) {
    RUN_INFINI(infinirtSetDevice(device, dev_id));
    // Weight uploads may still be in flight on their dedicated load stream.
    RUN_INFINI(infinirtStreamSynchronize(weights->load_stream));
    infiniopHandle_t handle;
    // BUGFIX: check these return codes — the original ignored them, unlike
    // every other runtime call in this file.
    RUN_INFINI(infiniopCreateHandle(&handle));
    infinirtStream_t stream;
    RUN_INFINI(infinirtStreamCreate(&stream));
    auto memory_pool = std::make_shared<MemoryPool>();
    *rsrc = Qwen3vlDeviceResource{
        device,
        dev_id,
        handle,
        weights,
        stream,
        comm,
        memory_pool,
    };
    RUN_INFINI(infinirtDeviceSynchronize());
}
void releaseDeviceResource(Qwen3vlDeviceResource &res) {
infinirtDeviceSynchronize();
res.weights.reset();
infiniopDestroyHandle(res.handle);
res.handle = nullptr;
infinirtStreamDestroy(res.stream);
res.stream = nullptr;
infinicclCommDestroy(res.comm);
res.comm = nullptr;
}
// Builds a [len, dim/2] sine table on the host and uploads it as a weight
// tensor: table[i][j] = sin(pos_ids[i][j] / theta^(j / (dim/2))).
// pos_ids has one row per sequence position, one column per rotary dim pair.
inline std::shared_ptr<Tensor> get_custom_SinTable(const Qwen3vlMeta &meta, std::vector<std::vector<uint32_t>> &pos_ids, uint32_t dim, size_t theta) {
    auto unit = dsize(meta.dtype);
    auto half_dim = dim / 2;
    size_t len = pos_ids.size();
    // RAII staging buffer instead of raw malloc/free.
    std::vector<uint8_t> table(len * half_dim * unit);
    for (size_t i = 0; i < len; i++) {
        for (size_t j = 0; j < half_dim; j++) {
            // BUGFIX(naming): the old code called this `_cos` even though it
            // holds a sine value.
            float sin_val = std::sin(
                static_cast<float>(pos_ids[i][j]) / std::pow(theta, static_cast<float>(j) / half_dim));
            if (meta.dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table.data())[i * half_dim + j] = f32_to_f16(sin_val);
            } else if (meta.dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table.data())[i * half_dim + j] = f32_to_bf16(sin_val);
            } else if (meta.dtype == INFINI_DTYPE_F32) {
                ((float *)table.data())[i * half_dim + j] = sin_val;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({len, half_dim});
    return Tensor::weight(table.data(), meta.dtype, shape);
}
// Builds a [len, dim/2] cosine table on the host and uploads it as a weight
// tensor: table[i][j] = cos(pos_ids[i][j] / theta^(j / (dim/2))).
// pos_ids has one row per sequence position, one column per rotary dim pair.
inline std::shared_ptr<Tensor> get_custom_CosTable(const Qwen3vlMeta &meta, std::vector<std::vector<uint32_t>> &pos_ids, uint32_t dim, size_t theta) {
    auto unit = dsize(meta.dtype);
    auto half_dim = dim / 2;
    size_t len = pos_ids.size();
    // RAII staging buffer instead of raw malloc/free.
    std::vector<uint8_t> table(len * half_dim * unit);
    for (size_t i = 0; i < len; i++) {
        for (size_t j = 0; j < half_dim; j++) {
            float cos_val = std::cos(
                static_cast<float>(pos_ids[i][j]) / std::pow(theta, static_cast<float>(j) / half_dim));
            if (meta.dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table.data())[i * half_dim + j] = f32_to_f16(cos_val);
            } else if (meta.dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table.data())[i * half_dim + j] = f32_to_bf16(cos_val);
            } else if (meta.dtype == INFINI_DTYPE_F32) {
                ((float *)table.data())[i * half_dim + j] = cos_val;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({len, half_dim});
    return Tensor::weight(table.data(), meta.dtype, shape);
}
// Bilinearly interpolates the learned absolute position-embedding grid onto
// each image's (h, w) patch grid, applies the spatial-merge permutation, and
// packs the per-image results into one [total_patches, hidden_size] tensor.
inline std::shared_ptr<Tensor> fast_pos_embed_interpolate(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
                                                          uint32_t *grid_thw, uint32_t num_batch, uint32_t total_patches) {
    auto dtype = meta.dtype;
    auto num_position_embeddings = meta.vis_meta.num_position_embeddings;
    auto hidden_size = meta.vis_meta.hidden_size;
    auto merge_size = meta.vis_meta.spatial_merge_size;
    // The learned table is a square grid, sqrt(num_position_embeddings) per side.
    auto num_grid_per_side = static_cast<uint32_t>(sqrt(num_position_embeddings));
    uint32_t total_pixels_offset = 0;
    std::shared_ptr<Tensor> patch_pos_embeds = Tensor::buffer(dtype, {total_patches, hidden_size}, rsrc.memory_pool);
    auto pos_embed_weight = rsrc.weights->w_vis->pos_embed_weight;
    std::vector<std::shared_ptr<Tensor>> pos_embeds(4);
    for (uint32_t i = 0; i < num_batch; ++i) {
        uint32_t t = grid_thw[i * 3];
        uint32_t h = grid_thw[i * 3 + 1];
        uint32_t w = grid_thw[i * 3 + 2];
        // NOTE(review): the weights are staged as uint16_t and converted with
        // f32_to_bf16 regardless of meta.dtype — confirm dtype is always BF16
        // on this path (the F16/F32 branches elsewhere suggest it may not be).
        auto weight_array = std::vector<uint16_t>(h * w * hidden_size);
        auto weight_tensor = Tensor::buffer(dtype, {h * w, hidden_size}, rsrc.memory_pool);
        // Compute bilinear interpolation indices and weights.
        std::vector<std::vector<uint32_t>> indices(4);
        std::vector<std::vector<float>> weights(4);
        auto linspace = [](float start, float end, uint32_t num_points) -> std::vector<float> {
            std::vector<float> res(num_points);
            for (uint32_t i = 0; i < num_points; ++i) {
                res[i] = start + (end - start) * i / (num_points - 1);
            }
            return res;
        };
        auto h_idxs = linspace(0, num_grid_per_side - 1, h);
        auto w_idxs = linspace(0, num_grid_per_side - 1, w);
        for (uint32_t ih = 0; ih < h; ++ih) {
            for (uint32_t iw = 0; iw < w; ++iw) {
                float h_idx_f = h_idxs[ih], w_idx_f = w_idxs[iw];
                uint32_t h_idx_floor = static_cast<uint32_t>(floor(h_idx_f)),
                         w_idx_floor = static_cast<uint32_t>(floor(w_idx_f));
                uint32_t h_idx_ceil = std::min(static_cast<uint32_t>(ceil(h_idx_f)), num_grid_per_side - 1),
                         w_idx_ceil = std::min(static_cast<uint32_t>(ceil(w_idx_f)), num_grid_per_side - 1);
                float dh = h_idx_f - h_idx_floor, dw = w_idx_f - w_idx_floor;
                // The four neighbouring grid points and their bilinear weights.
                indices[0].push_back((h_idx_floor * num_grid_per_side) + w_idx_floor);
                indices[1].push_back((h_idx_floor * num_grid_per_side) + w_idx_ceil);
                indices[2].push_back((h_idx_ceil * num_grid_per_side) + w_idx_floor);
                indices[3].push_back((h_idx_ceil * num_grid_per_side) + w_idx_ceil);
                weights[0].push_back((1 - dh) * (1 - dw));
                weights[1].push_back((1 - dh) * dw);
                weights[2].push_back(dh * (1 - dw));
                weights[3].push_back(dh * dw);
            }
        }
        // Gather the four neighbour embeddings and scale each by its weight.
        for (int j = 0; j < 4; ++j) {
            pos_embeds[j] = Tensor::buffer(dtype, {h * w, hidden_size}, rsrc.memory_pool);
            for (size_t i = 0; i < h * w; i++) {
                rearrange(pos_embeds[j]->slice(0, i, 1), pos_embed_weight->slice(0, indices[j][i], 1));
            }
            // Broadcast each scalar weight across the hidden dimension.
            for (size_t i = 0; i < h * w; i++) {
                uint16_t w_value = f32_to_bf16(weights[j][i]);
                for (size_t k = 0; k < hidden_size; k++) {
                    weight_array[i * hidden_size + k] = w_value;
                }
            }
            RUN_INFINI(infinirtMemcpyAsync(weight_tensor->data(), weight_array.data(), sizeof(uint16_t) * h * w * hidden_size,
                                           INFINIRT_MEMCPY_H2D, rsrc.stream));
            mul(pos_embeds[j], pos_embeds[j], weight_tensor);
        }
        // Sum the four weighted neighbours (accumulates into pos_embeds[0]).
        auto patch_pos_embed = pos_embeds[0]; // [h*w, hidden_size]
        for (int j = 1; j < 4; ++j) {
            add(patch_pos_embed, patch_pos_embed, pos_embeds[j]);
        }
        // For video input (t > 1), repeat the spatial embedding per frame.
        if (t > 1) {
            auto temp_patch_pos_embed = Tensor::buffer(dtype, {t, h * w, hidden_size}, rsrc.memory_pool);
            for (size_t i = 0; i < t; i++) {
                rearrange(temp_patch_pos_embed->slice(0, i, 1), patch_pos_embed);
            }
            patch_pos_embed = temp_patch_pos_embed;
        }
        printf("merge patch pos embed\n"); // BUGFIX: was "/n" (literal slash-n, no newline)
        fflush(stdout);
        // Reorder into spatial-merge (block-major) patch order.
        patch_pos_embed = patch_pos_embed
                              ->view({t, h / merge_size, merge_size, w / merge_size, merge_size, hidden_size})
                              ->permute({0, 1, 3, 2, 4, 5})
                              ->view({t * h * w, hidden_size}); // NOTE(review): view after permute may fail on non-contiguous memory — confirm
        rearrange(patch_pos_embeds->slice(0, total_pixels_offset, t * h * w), patch_pos_embed);
        total_pixels_offset += t * h * w;
    }
    return patch_pos_embeds;
}
// Builds 2-D rotary sin/cos tables for vision attention. Each patch's row
// index drives the first head_dim/4 columns and its column index the second
// head_dim/4 columns of the [total_patches, head_dim/2] tables.
inline auto rot_pos_embed(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t *grid_thw, uint32_t num_batch, uint32_t total_patches) {
    auto dtype = meta.dtype;
    auto hidden_size = meta.vis_meta.hidden_size;
    auto num_heads = meta.vis_meta.num_heads;
    auto head_dim = hidden_size / num_heads;
    auto merge_size = meta.vis_meta.spatial_merge_size;
    // Row (y) and column (x) position id per patch, replicated over head_dim/4 dims.
    std::vector<std::vector<uint32_t>> pos_ids_table_y(
        total_patches,
        std::vector<uint32_t>(head_dim / 4));
    std::vector<std::vector<uint32_t>> pos_ids_table_x(
        total_patches,
        std::vector<uint32_t>(head_dim / 4));
    // BUGFIX: patch_offset must accumulate across batches — the old code
    // reset it inside the batch loop, so a second image overwrote the
    // first image's entries instead of appending after them.
    size_t patch_offset = 0;
    for (uint32_t b = 0; b < num_batch; ++b) {
        uint32_t offset = b * 3;
        uint32_t num_frames = grid_thw[offset + 0];
        uint32_t height = grid_thw[offset + 1];
        uint32_t width = grid_thw[offset + 2];
        uint32_t merged_h = height / merge_size;
        uint32_t merged_w = width / merge_size;
        // Walk patches in spatial-merge (block-major) order, matching the
        // ordering used by fast_pos_embed_interpolate.
        for (uint32_t bh = 0; bh < merged_h; ++bh) {
            for (uint32_t bw = 0; bw < merged_w; ++bw) {
                for (uint32_t ih = 0; ih < merge_size; ++ih) {
                    for (uint32_t iw = 0; iw < merge_size; ++iw) {
                        uint32_t row = bh * merge_size + ih;
                        uint32_t col = bw * merge_size + iw;
                        // Repeat the same (row, col) for every temporal frame.
                        for (uint32_t f = 0; f < num_frames; ++f) {
                            for (size_t dim_offset = 0; dim_offset < head_dim / 4; dim_offset++) {
                                pos_ids_table_y[patch_offset][dim_offset] = row;
                                pos_ids_table_x[patch_offset][dim_offset] = col;
                            }
                            patch_offset++;
                        }
                    }
                }
            }
        }
    }
    auto sin = Tensor::buffer(dtype, {total_patches, head_dim / 2}, rsrc.memory_pool);
    auto sin_y = get_custom_SinTable(meta, pos_ids_table_y, head_dim / 2, 10000);
    rearrange(sin->slice(1, 0, head_dim / 4), sin_y);
    auto sin_x = get_custom_SinTable(meta, pos_ids_table_x, head_dim / 2, 10000);
    // BUGFIX: the second half must come from the x table (sin_x was computed
    // but never used), and the slice length is head_dim/4 to match the table
    // width (slice takes (dim, start, len) everywhere else in this file).
    rearrange(sin->slice(1, head_dim / 4, head_dim / 4), sin_x);
    auto cos = Tensor::buffer(dtype, {total_patches, head_dim / 2}, rsrc.memory_pool);
    auto cos_y = get_custom_CosTable(meta, pos_ids_table_y, head_dim / 2, 10000);
    rearrange(cos->slice(1, 0, head_dim / 4), cos_y);
    auto cos_x = get_custom_CosTable(meta, pos_ids_table_x, head_dim / 2, 10000);
    // BUGFIX: same for the cosine table (was cos_y with a head_dim/2 length).
    rearrange(cos->slice(1, head_dim / 4, head_dim / 4), cos_x);
    return std::pair{sin, cos};
}
// Vision-tower prefix of batched inference: patch-embeds the raw pixel
// values via a 3-D convolution, adds interpolated absolute position
// embeddings, and builds the rotary sin/cos tables.
// NOTE(review): work in progress — `video_tensor`, `weights`, and the
// {sin, cos} pair are computed but never consumed; the vision transformer
// blocks are not implemented yet.
void inferDeviceBatchVision(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
                            uint32_t idev, uint32_t ndev, InferRequest &req) {
    void *pixel_values = req.pixel_values;
    uint32_t total_patches = req.total_patches;
    uint32_t *image_grid_thw = req.image_grid_thw;
    uint32_t num_images = req.num_images;
    void *pixel_values_videos = req.pixel_values_videos;
    uint32_t total_patches_videos = req.total_patches_videos;
    // uint32_t *video_grid_thw = req.video_grid_thw;
    // uint32_t num_videos = req.num_videos;
    // uint32_t patch_features = req.patch_features;
    auto dtype = meta.dtype;
    auto d = meta.vis_meta.hidden_size;
    auto channels = meta.vis_meta.in_channels;
    auto patch_size = meta.vis_meta.patch_size;
    auto temporal_patch_size = meta.vis_meta.temporal_patch_size;
    // auto stream = rsrc.stream;
    auto weights = rsrc.weights;
    // Each row of pixel_values is one flattened patch: C * T * P * P values.
    auto image_tensor = Tensor::weight(pixel_values, dtype, {total_patches, channels * temporal_patch_size * patch_size * patch_size});
    // NOTE(review): pixel_values_videos may be null when no video input was
    // given — confirm Tensor::weight tolerates a null host pointer.
    auto video_tensor = Tensor::weight(pixel_values_videos, dtype, {total_patches_videos, channels * temporal_patch_size * patch_size * patch_size});
    // Patch embedding output; the trailing 1x1x1 dims are squeezed by the view below.
    auto hidden_states = Tensor::buffer(dtype, {total_patches, d, 1, 1, 1}, rsrc.memory_pool);
    std::vector<size_t> pads = {0, 0, 0};
    std::vector<ptrdiff_t> strides = {static_cast<long>(temporal_patch_size), static_cast<long>(patch_size), static_cast<long>(patch_size)};
    std::vector<size_t> dilations = {1, 1, 1};
    conv(hidden_states, image_tensor, rsrc.weights->w_vis->patch_embed_weight, rsrc.weights->w_vis->patch_embed_bias,
         pads.data(), strides.data(), dilations.data(), 3);
    hidden_states = hidden_states->view({total_patches, d});
    // Add bilinearly interpolated learned position embeddings in place.
    auto pos_embeds = fast_pos_embed_interpolate(meta, rsrc, image_grid_thw, num_images, total_patches);
    add(hidden_states, hidden_states, pos_embeds);
    // 2-D rotary tables for the vision attention layers (currently unused).
    auto [sin, cos] = rot_pos_embed(meta, rsrc, image_grid_thw, num_images, total_patches);
}
void inferDeviceBatchText(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
uint32_t idev, uint32_t ndev, InferRequest &req) {
const uint32_t *tokens = req.tokens;
uint32_t ntok = req.ntok;
const uint32_t *req_lens = req.req_lens;
uint32_t nreq = req.nreq;
const uint32_t *req_pos = req.req_pos;
struct Qwen3vlCache **caches = req.kv_caches;
const float *temperature = req.temperature;
const uint32_t *topk = req.topk;
const float *topp = req.topp;
uint32_t *output = req.output;
void *last_logits = req.logits;
assert(meta.text_meta.num_attention_heads % ndev == 0);
assert(meta.text_meta.num_key_value_heads % ndev == 0);
auto dtype = meta.dtype;
auto nlayer = meta.text_meta.num_hidden_layers;
size_t nh = meta.text_meta.num_attention_heads / size_t(ndev);
size_t nkvh = meta.text_meta.num_key_value_heads / size_t(ndev);
auto ngroup = nh / nkvh;
auto dh = meta.text_meta.head_dim;
auto d = meta.text_meta.hidden_size;
auto di = meta.text_meta.intermediate_size / size_t(ndev);
auto dvoc = meta.text_meta.vocab_size;
float epsilon = meta.text_meta.rms_norm_eps;
auto stream = rsrc.stream;
auto weights = rsrc.weights;
// Allocate buffers
auto logits_in = Tensor::buffer(dtype, {ntok, d}, rsrc.memory_pool);
auto logits_out = Tensor::buffer(dtype, {ntok, d}, rsrc.memory_pool);
// 所有请求的当前token
auto qkv_buf = Tensor::buffer(dtype, {ntok, (nh + nkvh * 2) * dh}, rsrc.memory_pool);
auto o_buf = Tensor::buffer(dtype, {ntok, nh * dh}, rsrc.memory_pool);
auto gate_up_buf = Tensor::buffer(dtype, {ntok, 2 * di}, rsrc.memory_pool);
auto prob_buf = Tensor::buffer(dtype, {nreq, dvoc}, rsrc.memory_pool);
auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool);
auto result_cpu = std::vector<int64_t>(nreq);
auto qkv_rope = qkv_buf->view({ntok, nh + nkvh * 2, dh});
auto q_buf = qkv_rope->slice(1, 0, nh);
auto k_buf = qkv_rope->slice(1, nh, nkvh);
// Prepare inputs
auto batch_pos_ids = std::vector<uint32_t>(ntok);
size_t req_start = 0;
for (uint32_t req = 0; req < nreq; req++) {
for (uint32_t i = 0; i < req_lens[req]; i++) { // req_len 本次query长度,req_pos 历史长度
batch_pos_ids[req_start + i] = req_pos[req] + i; // batch_pos_ids 展平后每个token的pos
}
req_start += req_lens[req];
}
std::shared_ptr<Tensor> pos_ids_buf;
if (rsrc.device == INFINI_DEVICE_CPU) {
pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
} else {
pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool);
RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok,
INFINIRT_MEMCPY_H2D, stream));
}
// convert tokens to embeddings
for (uint32_t i = 0; i < ntok; i++) {
RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
weights->w_lang->in_embd->data(tokens[i] * d),
dsize(dtype) * d, INFINIRT_MEMCPY_D2D, stream));
}
// attention inner
size_t max_qk_size = 0;
size_t max_seq_len = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto past_len = req_pos[req];
auto seq_len = req_lens[req];
auto total_len = past_len + seq_len;
max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len));
max_seq_len = std::max(max_seq_len, size_t(seq_len));
}
auto qk_buf = Tensor::buffer(dtype, {nh * max_qk_size}, rsrc.memory_pool);
auto rearrange_q_buf = Tensor::buffer(dtype, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh});
auto attn_val_buf = Tensor::buffer(dtype, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh});
auto gate_buf = gate_up_buf->slice(1, 0, di);
auto up_buf = gate_up_buf->slice(1, di, di);
// Compute
for (uint32_t i = 0; i < nlayer; i++) {
// attn norm
rmsnorm(logits_out, logits_in, weights->w_lang->layers[i].attn_norm, epsilon);
// qkv_proj
linear(qkv_buf, logits_out, weights->w_lang->layers[i].attn_qkv_proj, 1.0, 0.0, nullptr, nullptr);
// qk_norm
rmsnorm(q_buf, q_buf, weights->w_lang->layers[i].attn_q_norm, epsilon);
rmsnorm(k_buf, k_buf, weights->w_lang->layers[i].attn_k_norm, epsilon);
// rope
rope_v2(q_buf, q_buf, pos_ids_buf, weights->sin_table, weights->cos_table);
rope_v2(k_buf, k_buf, pos_ids_buf, weights->sin_table, weights->cos_table);
// 逐个req处理
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto past_len = req_pos[req];
auto seq_len = req_lens[req];
auto total_len = past_len + seq_len;
auto o = o_buf->slice(0, token_offset, seq_len)->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); // [nkvh, ngroup, seq_len, dh]
auto q = qkv_rope->slice({{0, token_offset, seq_len}, {1, 0, nh}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); // [nkvh, ngroup, seq_len, dh]
auto k = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh, nkvh}}); // [ntok, nkvh, dh]
auto v = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}}); // [ntok, nkvh, dh]
// concat to cache
rearrange(caches[req]->k_rot[idev][i]->slice(0, past_len, seq_len), k);
rearrange(caches[req]->v[idev][i]->slice(0, past_len, seq_len), v);
// fill full_k full_v
auto full_k_buff = caches[req]->k_rot[idev][i]->slice(0, 0, total_len)->permute({1, 2, 0}); // [nkvh, dh, total_len]
auto full_v_buff = caches[req]->v[idev][i]->slice(0, 0, total_len)->permute({1, 0, 2}); // [nkvh, total_len, dh]
// self-attn
rearrange(q_rearrange->slice(2, 0, seq_len), q);
auto attn_score_req = qk_buf->slice(0, 0, nh * seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len});
// [nkvh, ngroup * seq_len, dh] @ [nkvh, dh, total_len] = [nkvh, ngroup * seq_len, total_len]
linear(attn_score_req, rearrange_q_buf->slice(1, 0, ngroup * seq_len), full_k_buff, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr);
// softmax
auto qk_softmax = attn_score_req->view({nh, seq_len, total_len});
causalSoftmax(qk_softmax, qk_softmax);
// [nkvh, ngroup * seq_len, total_len] @ [nkvh, total_len, dh] = [nkvh, ngroup * seq_len, dh]
linear(attn_val_buf->slice(1, 0, ngroup * seq_len), attn_score_req, full_v_buff, 1.0, 0.0, nullptr, nullptr);
// printf("rearrage o; layer[%d]\n",i);
rearrange(o, attn_val_gemm->slice(2, 0, seq_len));
token_offset += seq_len;
}
linear(logits_in, o_buf, weights->w_lang->layers[i].attn_o_proj, 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dtype,
INFINICCL_SUM, rsrc.comm, stream));
RUN_INFINI(infinirtStreamSynchronize(stream));
}
// mlp norm
rmsnorm(logits_out, logits_in, weights->w_lang->layers[i].mlp_norm, epsilon);
// mlp gate_up
linear(gate_up_buf, logits_out, weights->w_lang->layers[i].mlp_gate_up, 1.0, 0.0, nullptr, nullptr);
// silu
silu(gate_buf, gate_buf);
mul(gate_buf, gate_buf, up_buf);
// mlp down
linear(logits_in, gate_buf, weights->w_lang->layers[i].mlp_down, 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr);
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dtype,
INFINICCL_SUM, rsrc.comm, stream));
RUN_INFINI(infinirtStreamSynchronize(stream));
}
}
// sample and output
if (idev == 0) {
if (last_logits != nullptr) {
rmsnorm(logits_out, logits_in, weights->w_lang->out_norm, epsilon);
auto last_logits_buf = Tensor::buffer(dtype, {ntok, dvoc}, rsrc.memory_pool);
linear(last_logits_buf, logits_out, weights->w_lang->out_embd, 1.0, 0.0, nullptr, nullptr);
RUN_INFINI(infinirtStreamSynchronize(stream));
RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dtype) * ntok * dvoc, INFINIRT_MEMCPY_D2H));
}
if (output != nullptr) {
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
token_offset += seq_len;
rmsnorm(logits_out->slice(0, req, 1),
logits_in->slice(0, token_offset - 1, 1),
weights->w_lang->out_norm,
epsilon);
}
linear(prob_buf, logits_out->slice(0, 0, nreq), weights->w_lang->out_embd, 1.0, 0.0, nullptr, nullptr);
std::random_device _rd;
std::mt19937 gen(_rd());
token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
randomSample(result_buf->slice(0, req, 1)->view_as({}, {}),
prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}),
random_val, topp[req], topk[req], temperature[req]);
token_offset += seq_len;
}
RUN_INFINI(infinirtStreamSynchronize(stream));
RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(),
sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H));
for (uint32_t req = 0; req < nreq; req++) {
output[req] = uint32_t(result_cpu[req]);
}
}
}
}
void inferDeviceBatch(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
                      uint32_t idev, uint32_t ndev, InferState &state, InferRequest &req) {
    // Run the vision encoder first when the request carries visual input,
    // then rendezvous with the other device workers before text decoding.
    const bool has_visual_input = (req.num_images > 0) || (req.num_videos > 0);
    if (has_visual_input) {
        inferDeviceBatchVision(meta, rsrc, idev, ndev, req);
        // Barrier on the shared (static) sync state: the last worker to
        // arrive wakes everyone else.
        std::unique_lock<std::mutex> lock(state.mtx_sync);
        if (--state.sync_cnt == 0) {
            state.cv_sync.notify_all();
        } else {
            state.cv_sync.wait(lock, [&] { return state.sync_cnt == 0; });
        }
    }
    // Text pass always runs.
    inferDeviceBatchText(meta, rsrc, idev, ndev, req);
}
// C-ABI entry point for sampling: stages the request into the model's shared
// InferRequest, wakes every device worker, then blocks until all workers
// report completion. Sampled token ids are written to `output`.
__INFINI_C void
inferBatchQwen3vl(struct Qwen3vlModel *model,
                  const uint32_t *tokens, uint32_t ntok,
                  void *pixel_values, uint32_t total_patches,
                  uint32_t *image_grid_thw, uint32_t num_images,
                  void *pixel_values_videos, uint32_t total_patches_videos,
                  uint32_t *video_grid_thw, uint32_t num_videos,
                  uint32_t patch_features,
                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                  struct Qwen3vlCache **kv_caches,
                  const float *temperature, const uint32_t *topk, const float *topp,
                  uint32_t *output) {
    auto &r = model->req;
    r.tokens = tokens;
    r.ntok = ntok;
    r.pixel_values = pixel_values;
    r.total_patches = total_patches;
    r.image_grid_thw = image_grid_thw;
    r.num_images = num_images;
    r.pixel_values_videos = pixel_values_videos;
    r.total_patches_videos = total_patches_videos;
    r.video_grid_thw = video_grid_thw;
    r.num_videos = num_videos;
    r.patch_features = patch_features;
    r.req_lens = req_lens;
    r.nreq = nreq;
    r.req_pos = req_pos;
    r.kv_caches = kv_caches;
    // Sampling mode: token output requested, no raw logits.
    r.output = output;
    r.logits = nullptr;
    r.temperature = temperature;
    r.topk = topk;
    r.topp = topp;
    const size_t ndev = model->dev_ids.size();
    // Arm the shared vision-pass barrier (sync_cnt is a static member of InferState).
    model->states[0].sync_cnt = int(ndev);
    // Kick off every worker.
    for (size_t d = 0; d < ndev; d++) {
        {
            std::lock_guard<std::mutex> guard(model->states[d].mtx);
            model->states[d].proceed = true;
        }
        model->states[d].cv_start.notify_one();
    }
    // Wait for completion in reverse order; a worker clears `proceed` when done.
    for (size_t d = ndev; d-- > 0;) {
        std::unique_lock<std::mutex> lock(model->states[d].mtx);
        model->states[d].cv_done.wait(lock, [&] { return !model->states[d].proceed; });
    }
}
// C-ABI entry point for a forward-only pass: like inferBatchQwen3vl, but
// returns raw logits via `logits` and performs no sampling (output and all
// sampling parameters are null).
__INFINI_C void
forwardBatchQwen3vl(struct Qwen3vlModel *model,
                    const uint32_t *tokens, uint32_t ntok,
                    void *pixel_values, uint32_t total_patches,
                    uint32_t *image_grid_thw, uint32_t num_images,
                    void *pixel_values_videos, uint32_t total_patches_videos,
                    uint32_t *video_grid_thw, uint32_t num_videos,
                    uint32_t patch_features,
                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                    struct Qwen3vlCache **kv_caches,
                    void *logits) {
    auto &r = model->req;
    r.tokens = tokens;
    r.ntok = ntok;
    r.pixel_values = pixel_values;
    r.total_patches = total_patches;
    r.image_grid_thw = image_grid_thw;
    r.num_images = num_images;
    r.pixel_values_videos = pixel_values_videos;
    r.total_patches_videos = total_patches_videos;
    r.video_grid_thw = video_grid_thw;
    r.num_videos = num_videos;
    r.patch_features = patch_features;
    r.req_lens = req_lens;
    r.nreq = nreq;
    r.req_pos = req_pos;
    r.kv_caches = kv_caches;
    // Logits mode: no sampled-token output, no sampling parameters.
    r.output = nullptr;
    r.logits = logits;
    r.temperature = nullptr;
    r.topk = nullptr;
    r.topp = nullptr;
    const size_t ndev = model->dev_ids.size();
    // Arm the shared vision-pass barrier (sync_cnt is a static member of InferState).
    model->states[0].sync_cnt = int(ndev);
    // Kick off every worker.
    for (size_t d = 0; d < ndev; d++) {
        {
            std::lock_guard<std::mutex> guard(model->states[d].mtx);
            model->states[d].proceed = true;
        }
        model->states[d].cv_start.notify_one();
    }
    // Wait for completion in reverse order; a worker clears `proceed` when done.
    for (size_t d = ndev; d-- > 0;) {
        std::unique_lock<std::mutex> lock(model->states[d].mtx);
        model->states[d].cv_done.wait(lock, [&] { return !model->states[d].proceed; });
    }
}
// Worker-thread body for one device: sets up the device resources and the
// thread-local inference context, signals the constructor that loading is
// done, then services batches posted via `state` until exit_flag is set.
// The condition-variable handshake mirrors inferBatchQwen3vl/forwardBatchQwen3vl.
void launchDevice(const Qwen3vlMeta &meta, std::shared_ptr<Qwen3vlDeviceWeights> weights, Qwen3vlDeviceResource *rsrc, InferState &state, InferRequest &req,
                  infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
    // Create Device Resource
    createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
    // NOTE(review): 100 is the cache-manager capacity — meaning not visible
    // here; confirm against CacheManager's constructor.
    CacheManager cache_manager(100);
    InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
    // Set the inference context for this thread
    setInferenceContext(&ctx);
    {
        // Tell the constructor this worker finished loading.
        std::unique_lock<std::mutex> lock(state.mtx);
        state.loaded = true;
        lock.unlock();
        state.cv_load.notify_one();
    }
    // Infer Loop
    while (true) {
        std::unique_lock<std::mutex> lock(state.mtx);
        // Sleep until a batch is posted (proceed) or shutdown is requested.
        state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; });
        // quit if exit_flag is set
        if (state.exit_flag) {
            break;
        }
        inferDeviceBatch(meta, *rsrc, idev, ndev, state, req);
        // Clearing `proceed` is the completion signal the caller waits on.
        state.proceed = false;
        lock.unlock();
        state.cv_done.notify_one();
    }
    // Clean-Up
    releaseDeviceResource(*rsrc);
    setInferenceContext(nullptr); // Clear the context when done
}
// Spawns one worker thread per device (see launchDevice), initializing the
// collective communicators when more than one device is present, and blocks
// until every worker reports that its device resources are loaded.
Qwen3vlModel::Qwen3vlModel(const Qwen3vlMeta *_meta, const Qwen3vlWeights *weights) : meta(*_meta) {
    const auto &device_weights = weights->device_weights;
    const int ndev = int(device_weights.size());
    device = device_weights[0]->device;
    dev_ids.resize(ndev);
    for (int i = 0; i < ndev; i++) {
        dev_ids[i] = device_weights[i]->dev_id;
    }
    dev_resources = std::vector<Qwen3vlDeviceResource>(ndev);
    states = std::vector<InferState>(ndev);
    threads.resize(ndev);
    RUN_INFINI(infinirtInit());
    std::vector<infinicclComm_t> comms(ndev, nullptr);
    if (ndev > 1) {
        RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
    }
    for (int i = 0; i < ndev; i++) {
        threads[i] = std::thread(launchDevice, std::cref(meta), device_weights[i], &dev_resources[i],
                                 std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]);
    }
    // Block until every worker has signalled cv_load.
    for (int i = 0; i < ndev; i++) {
        std::unique_lock<std::mutex> lock(states[i].mtx);
        states[i].cv_load.wait(lock, [&] { return states[i].loaded; });
    }
}
// Thin C-ABI factory wrapper around the Qwen3vlModel constructor.
// Ownership of the returned model passes to the caller (destroyQwen3vlModel).
__INFINI_C struct Qwen3vlModel *
createQwen3vlModel(const Qwen3vlMeta *_meta,
                   const Qwen3vlWeights *weights) {
    return new Qwen3vlModel(_meta, weights);
}
// Signals every worker thread to exit, joins them all, then frees the model.
__INFINI_C void
destroyQwen3vlModel(struct Qwen3vlModel *model) {
    const size_t ndev = model->dev_resources.size();
    for (size_t idev = 0; idev < ndev; idev++) {
        {
            std::lock_guard<std::mutex> guard(model->states[idev].mtx);
            model->states[idev].exit_flag = true;
        }
        model->states[idev].cv_start.notify_one();
    }
    for (auto &worker : model->threads) {
        worker.join();
    }
    delete model;
}
#include "qwen3vl_impl.hpp"
// Allocates the per-device, per-layer KV cache for one request, sized to the
// model's max_tokens. K-rot and V buffers share the same [max_len, nkv, dh]
// shape, with KV heads sharded evenly across devices.
// Fixes vs. original: removed the redundant `std::move` applied to prvalues
// returned by Tensor::buffer (push_back of a prvalue already moves), merged
// the two identical shape vectors, and reserved vector capacity up front.
__INFINI_C struct Qwen3vlCache *
createQwen3vlCache(const struct Qwen3vlModel *model) {
    auto cache = new Qwen3vlCache();
    const auto ndev = model->dev_resources.size();
    const auto nlayer = model->meta.text_meta.num_hidden_layers;
    const auto max_len = model->meta.text_meta.max_tokens;
    const auto dh = model->meta.text_meta.head_dim;
    // KV heads are sharded evenly across devices.
    const auto nkv = model->meta.text_meta.num_key_value_heads / size_t(ndev);
    const auto kv_shape = std::vector<size_t>{max_len, nkv, dh};
    cache->k_rot.reserve(ndev);
    cache->v.reserve(ndev);
    for (size_t idev = 0; idev < ndev; idev++) {
        // Allocate on the owning device so the buffers live in its memory.
        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
        std::vector<std::shared_ptr<Tensor>> k_rot_cache;
        std::vector<std::shared_ptr<Tensor>> v_cache;
        k_rot_cache.reserve(nlayer);
        v_cache.reserve(nlayer);
        for (size_t layer = 0; layer < nlayer; layer++) {
            k_rot_cache.push_back(Tensor::buffer(model->meta.dtype, kv_shape));
            v_cache.push_back(Tensor::buffer(model->meta.dtype, kv_shape));
        }
        cache->k_rot.push_back(std::move(k_rot_cache));
        cache->v.push_back(std::move(v_cache));
    }
    return cache;
}
////// TODO: confirm whether the visual deepstack features also need their own cache.
// Releases every per-device cache tensor with the owning device active (so
// device memory is freed in the right context), then deletes the cache object.
__INFINI_C void
dropQwen3vlCache(const struct Qwen3vlModel *model,
                 struct Qwen3vlCache *cache) {
    const size_t ndev = model->dev_resources.size();
    for (size_t idev = 0; idev < ndev; idev++) {
        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
        // Each inner vector holds one tensor per hidden layer.
        for (auto &k : cache->k_rot[idev]) {
            k.reset();
        }
        for (auto &v : cache->v[idev]) {
            v.reset();
        }
    }
    delete cache;
}
#ifndef QWEN3VL_IMPL_H
#define QWEN3VL_IMPL_H
#include "infinicore_infer.h"
#include "../../allocator.hpp"
#include "../../tensor.hpp"
#include <condition_variable>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
// Weights for one language-model decoder layer.
struct Qwen3vlLayerWeight {
    std::shared_ptr<Tensor> attn_norm;     // pre-attention RMSNorm ([hidden_size])
    std::shared_ptr<Tensor> attn_qkv_proj; // fused Q/K/V projection (sharded across devices)
    std::shared_ptr<Tensor> attn_q_norm;   // per-head RMSNorm on Q ([head_dim])
    std::shared_ptr<Tensor> attn_k_norm;   // per-head RMSNorm on K ([head_dim])
    std::shared_ptr<Tensor> attn_o_proj;   // attention output projection (sharded)
    std::shared_ptr<Tensor> mlp_norm;      // pre-MLP RMSNorm ([hidden_size])
    std::shared_ptr<Tensor> mlp_gate_up;   // fused gate+up projection (sharded)
    std::shared_ptr<Tensor> mlp_down;      // down projection (sharded)
};
// Language-model weights: token embeddings, output head, final norm, and all
// decoder layers.
struct Qwen3vlLanguageModelWeight {
    std::shared_ptr<Tensor> in_embd, out_embd, out_norm;
    std::vector<Qwen3vlLayerWeight> layers;
};
// Weights for one vision-transformer block: attention (fused QKV + output
// projection), two-layer MLP, and two norm weight/bias pairs.
struct Qwen3vlVisBlockWeight {
    std::shared_ptr<Tensor> attn_proj_weight, attn_proj_bias, attn_qkv_weight, attn_qkv_bias;
    std::shared_ptr<Tensor> mlp_linear_fc1_weight, mlp_linear_fc1_bias, mlp_linear_fc2_weight, mlp_linear_fc2_bias;
    std::shared_ptr<Tensor> norm1_weight, norm1_bias, norm2_weight, norm2_bias;
};
// Merger MLP applied to an intermediate ("deepstack") vision-layer output.
// NOTE(review): layout is identical to MergerWeight; the two could be unified.
struct DeepstackMergerWeight {
    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
    std::shared_ptr<Tensor> norm_weight, norm_bias;
};
// Final patch-merger MLP of the vision encoder (norm + two linear layers).
struct MergerWeight {
    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
    std::shared_ptr<Tensor> norm_weight, norm_bias;
};
// Full vision-encoder weight set: patch/positional embeddings, all
// transformer blocks, the deepstack mergers, and the final merger.
struct Qwen3vlVisualEncoderWeight {
    std::shared_ptr<Tensor> patch_embed_weight, patch_embed_bias, pos_embed_weight;
    std::vector<Qwen3vlVisBlockWeight> blocks;
    std::vector<DeepstackMergerWeight> deepstack_mergers;
    std::shared_ptr<MergerWeight> merger;
};
// All weights resident on a single device, plus the RoPE tables and the
// stream used for asynchronous weight uploads.
struct Qwen3vlDeviceWeights {
    std::shared_ptr<Tensor> sin_table, cos_table; // RoPE tables [max_tokens, head_dim/2]
    std::shared_ptr<Qwen3vlLanguageModelWeight> w_lang;
    std::shared_ptr<Qwen3vlVisualEncoderWeight> w_vis;
    infiniDevice_t device;
    int dev_id;
    infinirtStream_t load_stream; // stream used by the load_* functions
};
// Model weights replicated/sharded across all participating devices.
struct Qwen3vlWeights {
    Qwen3vlMeta const *meta;  // not owned; must outlive this object
    bool transpose_weight;    // true if host weights are stored transposed
    std::vector<std::shared_ptr<Qwen3vlDeviceWeights>> device_weights;
    Qwen3vlWeights(const Qwen3vlMeta *meta,
                   infiniDevice_t device,
                   int ndev,
                   const int *dev_ids,
                   bool transpose_weight);
};
// Runtime resources owned by one device worker thread.
struct Qwen3vlDeviceResource {
    // Device
    infiniDevice_t device;
    int device_id;
    infiniopHandle_t handle; // operator-library handle for this device
    // Weights
    std::shared_ptr<Qwen3vlDeviceWeights> weights;
    // Streams
    infinirtStream_t stream; // compute stream used for inference
    // Communicator
    infinicclComm_t comm; // nullptr when running single-device
    std::shared_ptr<MemoryPool> memory_pool; // scratch-buffer allocator
};
// Per-device worker synchronization state.
// NOTE(review): mtx_sync / sync_cnt / cv_sync are `inline static`, i.e.
// shared by ALL InferState instances in the process — they implement the
// cross-device barrier after the vision pass. Verify this is intended to be
// process-global rather than per-model.
struct InferState { // qwen3vl namespace
    inline static std::mutex mtx_sync;             // guards sync_cnt
    inline static int sync_cnt;                    // workers yet to reach the barrier
    inline static std::condition_variable cv_sync; // barrier wake-up
    std::mutex mtx; // guards the three flags below
    std::condition_variable cv_load, cv_start, cv_done;
    bool loaded = false;    // worker finished loading device resources
    bool proceed = false;   // a batch has been posted for this worker
    bool exit_flag = false; // worker should shut down
};
// One batched request shared by all device workers. Pointers are borrowed
// from the caller and must stay valid until the batch completes.
struct InferRequest { // qwen3vl namespace
    // Text input: flattened token ids of all requests.
    const uint32_t *tokens;
    uint32_t ntok;
    // Image input (optional).
    void *pixel_values;
    uint32_t total_patches;
    uint32_t *image_grid_thw; // per-image (t, h, w) grid sizes
    uint32_t num_images;
    // Video input (optional).
    void *pixel_values_videos;
    uint32_t total_patches_videos;
    uint32_t *video_grid_thw; // per-video (t, h, w) grid sizes
    uint32_t num_videos;
    uint32_t patch_features;
    // Per-request lengths and past (KV-cache) positions.
    const uint32_t *req_lens;
    uint32_t nreq;
    const uint32_t *req_pos;
    struct Qwen3vlCache **kv_caches; // one cache per request
    // Sampling parameters (null in forward/logits mode).
    const float *temperature;
    const uint32_t *topk;
    const float *topp;
    // Exactly one of these is non-null: sampled tokens or raw logits.
    uint32_t *output;
    void *logits;
};
// Top-level model object: one worker thread + resource set + state per device,
// and a single shared InferRequest staged by the C-ABI entry points.
struct Qwen3vlModel {
    Qwen3vlMeta meta;
    infiniDevice_t device;
    std::vector<int> dev_ids;
    std::vector<Qwen3vlDeviceResource> dev_resources;
    std::vector<InferState> states;
    std::vector<std::thread> threads;
    InferRequest req; // shared by all workers; valid only during a batch
    Qwen3vlModel(const Qwen3vlMeta *, const Qwen3vlWeights *weights);
};
// KV cache for one request: outer index = device, inner index = layer.
// Each tensor is [max_tokens, nkvh/ndev, head_dim] (see createQwen3vlCache).
struct Qwen3vlCache {
    std::vector<std::vector<std::shared_ptr<Tensor>>> k_rot, v;
};
#endif
#include "qwen3vl_impl.hpp"
#include <cmath>
inline std::shared_ptr<Tensor> getInEmbd(
const Qwen3vlMeta *meta) {
auto shape = std::vector<size_t>({meta->text_meta.vocab_size, meta->text_meta.hidden_size});
return Tensor::weight(nullptr, meta->dtype, shape);
}
inline std::shared_ptr<Tensor> getOutNorm(
const Qwen3vlMeta *meta) {
auto shape = std::vector<size_t>({meta->text_meta.hidden_size});
return Tensor::weight(nullptr, meta->dtype, shape);
}
inline std::shared_ptr<Tensor> getOutEmbd(
const Qwen3vlMeta *meta) {
auto shape = std::vector<size_t>({meta->text_meta.vocab_size, meta->text_meta.hidden_size});
return Tensor::weight(nullptr, meta->dtype, shape)
->permute({1, 0});
}
inline void getLayerWeight(
const Qwen3vlMeta *meta, Qwen3vlLayerWeight &layer, int ndev) {
auto nkvh = meta->text_meta.num_key_value_heads;
auto nh = meta->text_meta.num_attention_heads;
auto dh = meta->text_meta.head_dim;
auto d = meta->text_meta.hidden_size;
auto di = meta->text_meta.intermediate_size;
auto dh_shape = std::vector<size_t>({meta->text_meta.hidden_size});
layer.attn_norm = Tensor::weight(nullptr, meta->dtype, dh_shape);
auto qk_norm_shape = std::vector<size_t>({meta->text_meta.head_dim});
layer.attn_q_norm = Tensor::weight(nullptr, meta->dtype, qk_norm_shape);
layer.attn_k_norm = Tensor::weight(nullptr, meta->dtype, qk_norm_shape);
auto qkv_proj_shape = std::vector<size_t>({(nh + 2 * nkvh) / ndev * dh, d});
layer.attn_qkv_proj = Tensor::weight(nullptr, meta->dtype, qkv_proj_shape);
auto o_proj_shape = std::vector<size_t>({d, nh / ndev * dh});
layer.attn_o_proj = Tensor::weight(nullptr, meta->dtype, o_proj_shape);
layer.mlp_norm = Tensor::weight(nullptr, meta->dtype, dh_shape);
auto up_shape = std::vector<size_t>({2 * di / ndev, d});
layer.mlp_gate_up = Tensor::weight(nullptr, meta->dtype, up_shape);
auto down_shape = std::vector<size_t>({d, di / ndev});
layer.mlp_down = Tensor::weight(nullptr, meta->dtype, down_shape);
}
// Allocates placeholder tensors for the vision encoder: patch embedding,
// positional embedding, all transformer blocks, the final merger, and the
// deepstack mergers that tap intermediate block outputs.
inline void getVisualWeight(
    const Qwen3vlMeta *meta, std::shared_ptr<Qwen3vlVisualEncoderWeight> w_vis) {
    Qwen3vlVisMeta vis_meta = meta->vis_meta;
    // Conv3d patch embedding: [hidden, in_channels, t_patch, patch, patch].
    auto patch_embed_shape = std::vector<size_t>({vis_meta.hidden_size, vis_meta.in_channels, vis_meta.temporal_patch_size, vis_meta.patch_size, vis_meta.patch_size});
    w_vis->patch_embed_weight = Tensor::weight(nullptr, meta->dtype, patch_embed_shape);
    w_vis->patch_embed_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    w_vis->pos_embed_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.num_position_embeddings, vis_meta.hidden_size});
    // Final merger MLP.
    w_vis->merger = std::make_shared<MergerWeight>();
    w_vis->merger->linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.intermediate_size});
    w_vis->merger->linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size, vis_meta.intermediate_size});
    w_vis->merger->linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
    w_vis->merger->linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size});
    w_vis->merger->norm_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    w_vis->merger->norm_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    // Fix: the fused QKV projection has 3 * hidden_size output rows (Q, K, V).
    // The previous code used in_channels * hidden_size, which only produced
    // the right size because in_channels happens to be 3 (RGB).
    const size_t qkv_rows = 3 * vis_meta.hidden_size;
    w_vis->blocks = std::vector<Qwen3vlVisBlockWeight>(vis_meta.depth);
    for (size_t i = 0; i < vis_meta.depth; i++) {
        auto &blk = w_vis->blocks[i];
        blk.attn_proj_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size, vis_meta.hidden_size});
        blk.attn_proj_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        blk.attn_qkv_weight = Tensor::weight(nullptr, meta->dtype, {qkv_rows, vis_meta.hidden_size});
        blk.attn_qkv_bias = Tensor::weight(nullptr, meta->dtype, {qkv_rows});
        blk.mlp_linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.hidden_size});
        blk.mlp_linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
        blk.mlp_linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size, vis_meta.intermediate_size});
        blk.mlp_linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        blk.norm1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        blk.norm1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        blk.norm2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
        blk.norm2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size});
    }
    // One merger per tapped ("deepstack") intermediate layer.
    const size_t num_deepstack = 3;
    w_vis->deepstack_mergers = std::vector<DeepstackMergerWeight>(num_deepstack);
    for (size_t i = 0; i < num_deepstack; i++) {
        auto &m = w_vis->deepstack_mergers[i];
        m.linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.intermediate_size});
        m.linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size, vis_meta.intermediate_size});
        m.linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
        m.linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size});
        m.norm_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
        m.norm_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size});
    }
}
// Precomputes the RoPE sine table [max_tokens, head_dim/2] on the host and
// uploads it as a weight tensor: table[i][j] = sin(i / theta^(j / (dh/2))).
// Fix vs. original: the malloc result was dereferenced without a null check
// (undefined behavior on allocation failure) — now fails fast instead.
inline std::shared_ptr<Tensor> getSinTable(const Qwen3vlMeta *meta) {
    auto half_dh = meta->text_meta.head_dim / 2;
    auto unit = dsize(meta->dtype);
    void *table = std::malloc(meta->text_meta.max_tokens * half_dh * unit);
    if (table == nullptr) {
        std::cout << "failed to allocate sin table" << std::endl;
        exit(1);
    }
    for (size_t i = 0; i < meta->text_meta.max_tokens; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _sin = std::sin(
                static_cast<float>(i) / std::pow(meta->text_meta.rope_theta, static_cast<float>(j) / half_dh));
            // Convert to the model's compute dtype.
            if (meta->dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin);
            } else if (meta->dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin);
            } else if (meta->dtype == INFINI_DTYPE_F32) {
                ((float *)table)[i * half_dh + j] = _sin;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({meta->text_meta.max_tokens, half_dh});
    auto tensor = Tensor::weight(table, meta->dtype, shape);
    std::free(table);
    return tensor;
}
// Precomputes the RoPE cosine table [max_tokens, head_dim/2] on the host and
// uploads it as a weight tensor: table[i][j] = cos(i / theta^(j / (dh/2))).
// Fix vs. original: the malloc result was dereferenced without a null check
// (undefined behavior on allocation failure) — now fails fast instead.
inline std::shared_ptr<Tensor> getCosTable(const Qwen3vlMeta *meta) {
    auto half_dh = meta->text_meta.head_dim / 2;
    auto unit = dsize(meta->dtype);
    void *table = std::malloc(meta->text_meta.max_tokens * half_dh * unit);
    if (table == nullptr) {
        std::cout << "failed to allocate cos table" << std::endl;
        exit(1);
    }
    for (size_t i = 0; i < meta->text_meta.max_tokens; i++) {
        for (size_t j = 0; j < half_dh; j++) {
            float _cos = std::cos(
                static_cast<float>(i) / std::pow(meta->text_meta.rope_theta, static_cast<float>(j) / half_dh));
            // Convert to the model's compute dtype.
            if (meta->dtype == INFINI_DTYPE_F16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos);
            } else if (meta->dtype == INFINI_DTYPE_BF16) {
                ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos);
            } else if (meta->dtype == INFINI_DTYPE_F32) {
                ((float *)table)[i * half_dh + j] = _cos;
            } else {
                std::cout << "unsupported data type" << std::endl;
                exit(1);
            }
        }
    }
    auto shape = std::vector<size_t>({meta->text_meta.max_tokens, half_dh});
    auto tensor = Tensor::weight(table, meta->dtype, shape);
    std::free(table);
    return tensor;
}
// Allocates (empty) placeholder weight tensors for every device; actual data
// is copied in later by the load_* functions on each device's load_stream.
Qwen3vlWeights::Qwen3vlWeights(
    const Qwen3vlMeta *_meta, infiniDevice_t device, int ndev, const int *dev_ids, bool _transpose_weight) {
    meta = _meta;
    transpose_weight = _transpose_weight;
    device_weights.resize(ndev);
    for (int dev = 0; dev < ndev; dev++) {
        const int dev_id = dev_ids[dev];
        RUN_INFINI(infinirtSetDevice(device, dev_id));
        auto w = std::make_shared<Qwen3vlDeviceWeights>();
        w->device = device;
        w->dev_id = dev_id;
        RUN_INFINI(infinirtStreamCreate(&w->load_stream));
        w->w_lang = std::make_shared<Qwen3vlLanguageModelWeight>();
        w->w_vis = std::make_shared<Qwen3vlVisualEncoderWeight>();
        w->w_lang->in_embd = getInEmbd(meta);
        w->w_lang->out_norm = getOutNorm(meta);
        w->w_lang->out_embd = getOutEmbd(meta);
        w->sin_table = getSinTable(meta);
        w->cos_table = getCosTable(meta);
        w->w_lang->layers.resize(meta->text_meta.num_hidden_layers);
        for (auto &layer : w->w_lang->layers) {
            getLayerWeight(meta, layer, ndev);
        }
        getVisualWeight(meta, w->w_vis);
        device_weights[dev] = w;
    }
}
//--- Lang Global
// Uploads the token-embedding table to every device (replicated, not sharded).
void load_input_embd(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading input embedding from " << cpu_ptr << std::endl;
    for (auto &dev_w : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        dev_w->w_lang->in_embd->load(cpu_ptr, dev_w->load_stream);
    }
}
// Uploads the final RMSNorm weight to every device (replicated).
void load_output_norm(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading output norm from " << cpu_ptr << std::endl;
    for (auto &dev_w : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        dev_w->w_lang->out_norm->load(cpu_ptr, dev_w->load_stream);
    }
}
// Uploads the output-embedding (LM head) matrix to every device (replicated).
// When host weights are stored transposed, re-permutes the tensor afterwards.
void load_output_embd(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading output embedding from " << cpu_ptr << std::endl;
    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
        auto weight = weights->device_weights[dev];
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_lang->out_embd->load(cpu_ptr, weight->load_stream);
        if (weights->transpose_weight) {
            // Fix: permute() returns a new tensor view and the original code
            // discarded it, so the transpose never took effect. Assign the
            // result, matching every other loader (cf. load_attn_qkv_proj).
            weight->w_lang->out_embd = weight->w_lang->out_embd->permute({1, 0}); //[d,voc]
        }
    }
}
// --- Attention
// Uploads the pre-attention RMSNorm weight for `layer` to every device.
void load_attn_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading attention norm " << layer << " from " << cpu_ptr << std::endl;
    for (auto &dev_w : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        dev_w->w_lang->layers[layer].attn_norm->load(cpu_ptr, dev_w->load_stream);
    }
}
// Uploads the per-head Q RMSNorm weight for `layer` to every device.
void load_attn_q_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading attention q_norm " << layer << " from " << cpu_ptr << std::endl;
    for (auto &dev_w : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        dev_w->w_lang->layers[layer].attn_q_norm->load(cpu_ptr, dev_w->load_stream);
    }
}
// Uploads the fused QKV projection for `layer`, sharding rows across devices.
// Host layout: [ndev, (nh + 2*nkvh)/ndev * dh, d].
void load_attn_qkv_proj(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    // Fix: the log message said "q_proj" although this loads the fused qkv_proj.
    std::cout << "Loading attention qkv_proj " << layer << " from " << cpu_ptr << std::endl;
    int ndev = int(weights->device_weights.size());
    auto nkvh = weights->meta->text_meta.num_key_value_heads;
    auto nh = weights->meta->text_meta.num_attention_heads;
    auto dh = weights->meta->text_meta.head_dim;
    auto d = weights->meta->text_meta.hidden_size;
    for (int idev = 0; idev < ndev; idev++) {
        auto weight = weights->device_weights[idev];
        // Byte offset of this device's shard within the host buffer.
        size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(weights->meta->dtype);
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_lang->layers[layer].attn_qkv_proj->load((char *)cpu_ptr + offset, weight->load_stream);
        if (weights->transpose_weight) {
            weight->w_lang->layers[layer].attn_qkv_proj = weight->w_lang->layers[layer].attn_qkv_proj->permute({1, 0}); //[d, (nh+2*nkvh)*dh]
        }
    }
}
// Uploads the per-head K RMSNorm weight for `layer` to every device.
void load_attn_k_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading attention k_norm " << layer << " from " << cpu_ptr << std::endl;
    for (auto &dev_w : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        dev_w->w_lang->layers[layer].attn_k_norm->load(cpu_ptr, dev_w->load_stream);
    }
}
// Uploads the attention output projection for `layer`, sharded on the host as
// [ndev, d, nh/ndev * dh].
void load_attn_o_proj(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading attention o_proj " << layer << " from " << cpu_ptr << std::endl;
    const int ndev = int(weights->device_weights.size());
    const auto &tm = weights->meta->text_meta;
    // Size in bytes of one device's shard of the host buffer.
    const size_t shard_bytes = tm.hidden_size * (tm.num_attention_heads / ndev * tm.head_dim) * dsize(weights->meta->dtype);
    for (int idev = 0; idev < ndev; idev++) {
        auto dev_w = weights->device_weights[idev];
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        auto &o_proj = dev_w->w_lang->layers[layer].attn_o_proj;
        o_proj->load((char *)cpu_ptr + size_t(idev) * shard_bytes, dev_w->load_stream);
        if (weights->transpose_weight) {
            o_proj = o_proj->permute({1, 0}); //[nh/ndev*dh, d]
        }
    }
}
// --- MLP
// Uploads the pre-MLP RMSNorm weight for `layer` to every device.
void load_mlp_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading mlp norm " << layer << " from " << cpu_ptr << std::endl;
    for (auto &dev_w : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        dev_w->w_lang->layers[layer].mlp_norm->load(cpu_ptr, dev_w->load_stream);
    }
}
// Uploads the fused gate+up MLP projection for `layer`, sharded on the host
// as [ndev, 2*di/ndev, d].
void load_mlp_gate_up(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    // Fix: log said "mlp gate" although this loads the fused gate_up weight.
    std::cout << "Loading mlp gate_up " << layer << " from " << cpu_ptr << std::endl;
    int ndev = int(weights->device_weights.size());
    auto di = weights->meta->text_meta.intermediate_size;
    auto d = weights->meta->text_meta.hidden_size;
    for (int idev = 0; idev < ndev; idev++) {
        auto weight = weights->device_weights[idev];
        // Byte offset of this device's shard within the host buffer.
        size_t offset = idev * (2 * di / ndev) * d * dsize(weights->meta->dtype);
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_lang->layers[layer].mlp_gate_up->load((char *)cpu_ptr + offset, weight->load_stream);
        if (weights->transpose_weight) {
            weight->w_lang->layers[layer].mlp_gate_up = weight->w_lang->layers[layer].mlp_gate_up->permute({1, 0}); //[d, 2*di/ndev]
        }
    }
}
// Uploads the MLP down projection for `layer`, sharded on the host as
// [ndev, d, di/ndev].
void load_mlp_down(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading mlp down " << layer << " from " << cpu_ptr << std::endl;
    const int ndev = int(weights->device_weights.size());
    const auto &tm = weights->meta->text_meta;
    // Size in bytes of one device's shard of the host buffer.
    const size_t shard_bytes = tm.hidden_size * (tm.intermediate_size / ndev) * dsize(weights->meta->dtype);
    for (int idev = 0; idev < ndev; idev++) {
        auto dev_w = weights->device_weights[idev];
        RUN_INFINI(infinirtSetDevice(dev_w->device, dev_w->dev_id));
        auto &down = dev_w->w_lang->layers[layer].mlp_down;
        down->load((char *)cpu_ptr + size_t(idev) * shard_bytes, dev_w->load_stream);
        if (weights->transpose_weight) {
            down = down->permute({1, 0}); //[di/ndev, d]
        }
    }
}
// --- Vision weights
// Broadcast the vision patch-embedding weight to every device.
void load_patch_embed_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading patch embed weight from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->patch_embed_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the vision patch-embedding bias to every device.
void load_patch_embed_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading patch embed bias from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->patch_embed_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the vision positional-embedding weight to every device.
void load_pos_embed_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading pos embed weight from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->pos_embed_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Vision block attention
// Broadcast a vision block's attention output-projection weight to every device.
void load_attn_proj_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision attn proj weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].attn_proj_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's attention output-projection bias to every device.
void load_attn_proj_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision attn proj bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].attn_proj_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's fused QKV projection weight to every device.
void load_attn_qkv_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision attn qkv weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].attn_qkv_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's fused QKV projection bias to every device.
void load_attn_qkv_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision attn qkv bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].attn_qkv_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Vision block mlp
// Broadcast a vision block's MLP fc1 weight to every device.
void load_mlp_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision mlp fc1 weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].mlp_linear_fc1_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's MLP fc1 bias to every device.
void load_mlp_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision mlp fc1 bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].mlp_linear_fc1_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's MLP fc2 weight to every device.
void load_mlp_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision mlp fc2 weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].mlp_linear_fc2_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's MLP fc2 bias to every device.
void load_mlp_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision mlp fc2 bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].mlp_linear_fc2_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Vision block norm
// Broadcast a vision block's first layer-norm weight to every device.
void load_norm1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm1 weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm1_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's first layer-norm bias to every device.
void load_norm1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm1 bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm1_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's second layer-norm weight to every device.
void load_norm2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm2 weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm2_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a vision block's second layer-norm bias to every device.
void load_norm2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading vision norm2 bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->blocks[layer].norm2_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Deepstack merger
// Broadcast a deepstack merger's fc1 weight to every device.
void load_deepstack_merger_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc1 weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc1_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a deepstack merger's fc1 bias to every device.
void load_deepstack_merger_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc1 bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc1_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a deepstack merger's fc2 weight to every device.
void load_deepstack_merger_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc2 weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc2_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a deepstack merger's fc2 bias to every device.
void load_deepstack_merger_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger fc2 bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].linear_fc2_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a deepstack merger's norm weight to every device.
void load_deepstack_merger_norm_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger norm weight " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].norm_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast a deepstack merger's norm bias to every device.
void load_deepstack_merger_norm_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) {
    std::cout << "Loading deepstack merger norm bias " << layer << " from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->deepstack_mergers[layer].norm_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Merger
// Broadcast the final merger's fc1 weight to every device.
void load_merger_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc1 weight from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc1_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the final merger's fc1 bias to every device.
void load_merger_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc1 bias from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc1_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the final merger's fc2 weight to every device.
void load_merger_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc2 weight from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc2_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the final merger's fc2 bias to every device.
void load_merger_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger fc2 bias from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->linear_fc2_bias->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the final merger's norm weight to every device.
void load_merger_norm_weight(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger norm weight from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->norm_weight->load(cpu_ptr, weight->load_stream);
    }
}
// Broadcast the final merger's norm bias to every device.
void load_merger_norm_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
    std::cout << "Loading merger norm bias from " << cpu_ptr << std::endl;
    for (auto &weight : weights->device_weights) {
        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
        weight->w_vis->merger->norm_bias->load(cpu_ptr, weight->load_stream);
    }
}
// File-local loader vtable handed to C callers via createQwen3vlWeightLoader().
// NOTE: designated initializers must appear in the members' declaration order
// (see Qwen3vlLangWeightLoader / vision loader struct in the header) — do not
// reorder entries here without reordering the struct.
static Qwen3vlWeightLoader weight_loader = {
    // Language model loaders
    .lang_loader = {
        .load_input_embd = load_input_embd,
        .load_output_norm = load_output_norm,
        .load_output_embd = load_output_embd,
        .load_attn_norm = load_attn_norm,
        .load_attn_q_norm = load_attn_q_norm,
        .load_attn_k_norm = load_attn_k_norm,
        .load_attn_qkv_proj = load_attn_qkv_proj,
        .load_attn_o_proj = load_attn_o_proj,
        .load_mlp_norm = load_mlp_norm,
        .load_mlp_gate_up = load_mlp_gate_up,
        .load_mlp_down = load_mlp_down,
    },
    // Vision model loaders
    .vis_loader = {
        .load_patch_embed_weight = load_patch_embed_weight,
        .load_patch_embed_bias = load_patch_embed_bias,
        .load_pos_embed_weight = load_pos_embed_weight,
        .load_attn_proj_weight = load_attn_proj_weight,
        .load_attn_proj_bias = load_attn_proj_bias,
        .load_attn_qkv_weight = load_attn_qkv_weight,
        .load_attn_qkv_bias = load_attn_qkv_bias,
        .load_mlp_linear_fc1_weight = load_mlp_linear_fc1_weight,
        .load_mlp_linear_fc1_bias = load_mlp_linear_fc1_bias,
        .load_mlp_linear_fc2_weight = load_mlp_linear_fc2_weight,
        .load_mlp_linear_fc2_bias = load_mlp_linear_fc2_bias,
        .load_norm1_weight = load_norm1_weight,
        .load_norm1_bias = load_norm1_bias,
        .load_norm2_weight = load_norm2_weight,
        .load_norm2_bias = load_norm2_bias,
        .load_deepstack_merger_linear_fc1_weight = load_deepstack_merger_linear_fc1_weight,
        .load_deepstack_merger_linear_fc1_bias = load_deepstack_merger_linear_fc1_bias,
        .load_deepstack_merger_linear_fc2_weight = load_deepstack_merger_linear_fc2_weight,
        .load_deepstack_merger_linear_fc2_bias = load_deepstack_merger_linear_fc2_bias,
        .load_deepstack_merger_norm_weight = load_deepstack_merger_norm_weight,
        .load_deepstack_merger_norm_bias = load_deepstack_merger_norm_bias,
        .load_merger_linear_fc1_weight = load_merger_linear_fc1_weight,
        .load_merger_linear_fc1_bias = load_merger_linear_fc1_bias,
        .load_merger_linear_fc2_weight = load_merger_linear_fc2_weight,
        .load_merger_linear_fc2_bias = load_merger_linear_fc2_bias,
        .load_merger_norm_weight = load_merger_norm_weight,
        .load_merger_norm_bias = load_merger_norm_bias,
    }};
// C-linkage factory: allocates the device-side weight container.
// Ownership of the returned object transfers to the caller.
// The printf block is a debug trace; the struct sizes let the (FFI) caller
// cross-check its view of the ABI against this translation unit.
__INFINI_C Qwen3vlWeights *
createQwen3vlWeights(const Qwen3vlMeta *meta,
                     infiniDevice_t device,
                     int ndev,
                     const int *dev_ids,
                     bool transpose_weight) {
    printf("=== C++ createQwen3vlWeights ===\n");
    printf("sizeof(Qwen3vlTextMeta): %zu\n", sizeof(Qwen3vlTextMeta));
    printf("sizeof(Qwen3vlVisMeta): %zu\n", sizeof(Qwen3vlVisMeta));
    printf("sizeof(Qwen3vlMeta): %zu\n", sizeof(Qwen3vlMeta));
    printf("meta->dtype: %d\n", meta->dtype);
    printf("meta->text_meta.hidden_size: %zu\n", meta->text_meta.hidden_size);
    printf("meta->text_meta.num_hidden_layers: %zu\n", meta->text_meta.num_hidden_layers);
    printf("meta->text_meta.vocab_size: %zu\n", meta->text_meta.vocab_size);
    printf("meta->vis_meta.depth: %zu\n", meta->vis_meta.depth);
    // Fix: do not read dev_ids[0] unconditionally — it is UB when ndev == 0
    // or dev_ids is null; print -1 as a sentinel in that case.
    printf("device: %d, ndev: %d, dev_ids[0]: %d\n", device, ndev,
           (dev_ids != nullptr && ndev > 0) ? dev_ids[0] : -1);
    fflush(stdout);
    auto weights = new Qwen3vlWeights(meta, device, ndev, dev_ids, transpose_weight);
    return weights;
}
// C-linkage accessor for the file-local loader vtable.
// The pointee has static storage duration; callers must not free it.
__INFINI_C Qwen3vlWeightLoader *
createQwen3vlWeightLoader() {
    Qwen3vlWeightLoader *table = &weight_loader;
    return table;
}
...@@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape, ...@@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape,
std::cout << std::endl; std::cout << std::endl;
} else if (dim < shape.size() - 1) { } else if (dim < shape.size() - 1) {
for (size_t i = 0; i < shape[dim]; i++) { for (size_t i = 0; i < shape[dim]; i++) {
print_data(data + i * strides[dim], shape, strides, dim + 1); print_data_bf16(data + i * strides[dim], shape, strides, dim + 1);
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment