// deepseek.h
// Public C interface for the DeepSeek-V3 model: weight-loader callback table,
// model metadata, and the create / destroy / cache / batch-inference entry points.
#ifndef DEEPSEEK_V3_WEIGHTS_H
#define DEEPSEEK_V3_WEIGHTS_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stddef.h>
#include <stdint.h>

// Weights container. It is only forward-declared here and only ever used
// through pointers in this header.
struct DeepSeekV3Weights;

// Function pointer signatures used by DeepSeekV3WeightLoader.
//
// The `struct` keyword is spelled out on every DeepSeekV3Weights parameter:
// no typedef for that name exists in this header, so the bare name parses in
// C++ only. With the keyword the declarations are valid in both C and C++.

/// Loads one model-global weight.
/// @param weights Destination weights object.
/// @param cpu_ptr Source data pointer (the name suggests host memory — confirm).
typedef void (*load_global_fn)(struct DeepSeekV3Weights *weights, void *cpu_ptr);

/// Loads one per-layer weight.
/// @param weights  Destination weights object.
/// @param cpu_ptr  Source data pointer (the name suggests host memory — confirm).
/// @param layer_id Index of the layer the data belongs to.
typedef void (*load_layer_fn)(struct DeepSeekV3Weights *weights, void *cpu_ptr, size_t layer_id);

/// Loads one per-layer linear projection given as three buffers.
/// @param weights    Destination weights object.
/// @param weight_ptr Weight data.
/// @param scale_ptr  Scale data (presumably quantization scales — confirm).
/// @param zero_ptr   Zero data (presumably quantization zero points — confirm).
/// @param layer_id   Index of the layer the data belongs to.
typedef void (*load_layer_linear_fn)(
    struct DeepSeekV3Weights *weights,
    void *weight_ptr, void *scale_ptr, void *zero_ptr,
    size_t layer_id);

/// Loads the gate / up / down projections of one per-layer MLP, each given as
/// a (weight, scale, zero) triple.
/// @param weights  Destination weights object.
/// @param layer_id Index of the layer the data belongs to.
typedef void (*load_layer_mlp_fn)(
    struct DeepSeekV3Weights *weights,
    void *gate_weight_ptr, void *gate_scale_ptr, void *gate_zero_ptr,
    void *up_weight_ptr, void *up_scale_ptr, void *up_zero_ptr,
    void *down_weight_ptr, void *down_scale_ptr, void *down_zero_ptr,
    size_t layer_id);

/// Same buffers as load_layer_mlp_fn, addressed by layer and by expert.
/// @param weights   Destination weights object.
/// @param layer_id  Index of the layer the data belongs to.
/// @param expert_id Index of the expert within that layer.
typedef void (*load_layer_expert_mlp_fn)(
    struct DeepSeekV3Weights *weights,
    void *gate_weight_ptr, void *gate_scale_ptr, void *gate_zero_ptr,
    void *up_weight_ptr, void *up_scale_ptr, void *up_zero_ptr,
    void *down_weight_ptr, void *down_scale_ptr, void *down_zero_ptr,
    size_t layer_id, size_t expert_id);

// Table of weight-loading callbacks: one function pointer per weight or
// weight group. createDeepSeekV3WeightLoader() returns a pointer to this type.
// The callback signatures are the load_*_fn typedefs declared in this header.
typedef struct {
    // Global weights (callbacks take no layer index)
    load_global_fn load_input_embd;
    load_global_fn load_output_norm;
    load_global_fn load_output_embd;

    // Attention (per layer). The *_proj entries take a (weight, scale, zero)
    // pointer triple; the norm entries take a single data pointer.
    load_layer_fn load_attn_norm;
    load_layer_linear_fn load_attn_q_a_proj;
    load_layer_fn load_attn_q_a_layernorm;
    load_layer_linear_fn load_attn_q_b_proj;
    load_layer_linear_fn load_attn_kv_a_proj_with_mqa;
    load_layer_fn load_attn_kv_a_layernorm;
    load_layer_linear_fn load_attn_kv_b_proj;
    load_layer_linear_fn load_attn_o_proj;

    // MLP (per layer)
    load_layer_fn load_mlp_norm;
    // MLP dense part: gate / up / down projections loaded in one call
    load_layer_mlp_fn load_mlp_dense;

    // MLP sparse gating: single data pointer each
    load_layer_fn load_mlp_gate_weight;
    load_layer_fn load_mlp_gate_bias;

    // Shared experts: gate / up / down projections loaded in one call
    load_layer_mlp_fn load_mlp_shared_experts;

    // Per-expert functions: as above, with an additional expert index
    load_layer_expert_mlp_fn load_mlp_experts;

} DeepSeekV3WeightLoader;

// Model handle; only forward-declared in this header.
struct DeepSeekV3Model;

// Model description passed by pointer to createDeepSeekV3Model() and
// createDeepSeekV3Weights().
// NOTE(review): the per-field notes below are inferred from the field names
// only; this header does not establish them — confirm against the implementation.
typedef struct {
    // Element data types — presumably for logits, norm weights, the quantized
    // weight / scale / zero buffers, and the MoE gate weight / bias.
    infiniDtype_t dt_logits;
    infiniDtype_t dt_norm;
    infiniDtype_t dt_quant_weight;
    infiniDtype_t dt_quant_scale;
    infiniDtype_t dt_quant_zero;
    infiniDtype_t dt_gate_weight;
    infiniDtype_t dt_gate_bias;

    // Layer counts and attention dimensions — presumably: number of sparse
    // (MoE) and dense layers, hidden size, attention heads, KV heads, RoPE /
    // non-RoPE head dims, Q and KV low-rank sizes, QK and V head dims.
    size_t n_sparse_layer;
    size_t n_dense_layer;
    size_t d;
    size_t nh;
    size_t nkvh;
    size_t d_rope;
    size_t d_nope;
    size_t r_q;
    size_t r_kv;
    size_t d_qk;
    size_t d_v;

    // MoE / MLP and vocabulary sizes — presumably: routed-expert scaling
    // factor, total experts, experts selected per token, dense and MoE
    // intermediate sizes, context length, vocabulary size.
    float routed_scale;
    size_t nexperts;
    size_t kexperts;
    size_t di;
    size_t di_moe;
    size_t dctx;
    size_t dvoc;

    // Presumably: normalization epsilon, RoPE base, end-of-sequence token id.
    float epsilon;
    float rope_theta;
    uint32_t end_token;

} DeepSeekV3Meta;

//////////////////// APIs ///////////////////////
/// @brief 创建模型
/// @param device 协处理器种类
/// @param ndev 协处理器数量
/// @param dev_ids 协处理器编号,长度为 ndev
106
__INFINI_C __export struct DeepSeekV3Model *
blkmjsian's avatar
blkmjsian committed
107
108
109
createDeepSeekV3Model(const DeepSeekV3Meta *,
                      const DeepSeekV3Weights *);

110
__INFINI_C DeepSeekV3Weights *
blkmjsian's avatar
blkmjsian committed
111
112
113
114
115
createDeepSeekV3Weights(const DeepSeekV3Meta *meta,
                        infiniDevice_t device,
                        int ndev,
                        const int *dev_ids);

116
__INFINI_C __export DeepSeekV3WeightLoader *
blkmjsian's avatar
blkmjsian committed
117
118
119
createDeepSeekV3WeightLoader();

/// @brief 销毁模型
120
__INFINI_C __export void destroyDeepSeekV3Model(struct DeepSeekV3Model *);
blkmjsian's avatar
blkmjsian committed
121

122
__INFINI_C __export struct DeepSeekV3Cache *
blkmjsian's avatar
blkmjsian committed
123
124
createDeepSeekV3Cache(const struct DeepSeekV3Model *);

125
__INFINI_C __export void
blkmjsian's avatar
blkmjsian committed
126
127
128
129
130
131
132
133
134
135
136
137
138
139
dropDeepSeekV3Cache(const struct DeepSeekV3Model *,
                    struct DeepSeekV3Cache *);

/// @brief 批次推理一轮,并采样出新的 token
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param temperature 采样温度(0. 表示贪心采样)
/// @param topk 采样 topk(1 表示贪心采样)
/// @param topp 采样 topp
/// @param output 输出 token 数组,每个请求一个输出,长度至少为nreq
140
__INFINI_C __export void
blkmjsian's avatar
blkmjsian committed
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
inferBatchDeepSeekV3(struct DeepSeekV3Model *,
                     const uint32_t *tokens, uint32_t ntok,
                     const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                     struct DeepSeekV3Cache **caches,
                     const float *temperature, const uint32_t *topk, const float *topp,
                     uint32_t *output);

/// @brief 批次推理一轮,输出 output embedding 后的 logits
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param logits 输出 token 数组,每个请求一个输出,长度至少为nreq
156
__INFINI_C __export void
blkmjsian's avatar
blkmjsian committed
157
158
159
160
161
162
163
forwardBatchDeepSeekV3(struct DeepSeekV3Model *,
                       const uint32_t *tokens, uint32_t ntok,
                       const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                       struct DeepSeekV3Cache **caches,
                       void *logits);

#endif // DEEPSEEK_V3_WEIGHTS_H