llama.hpp 9.9 KB
Newer Older
1
2
3
#pragma once

#include "../../cache/kv_cache.hpp"
4
5
6
#include "../../models/debug_utils/hooks.hpp"
#include "../../models/llama/llama.hpp"
#include "../../models/llama/llama_attention.hpp"
7
8
#include "infinicore/device.hpp"
#include "infinicore/nn/module.hpp"
PanZezhong's avatar
PanZezhong committed
9
#include "infinicore/nn/rope.hpp"
10
11
12
13
#include "infinicore/tensor.hpp"
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
14
15
16
17
18
19
20
21
22
23
24
25

namespace py = pybind11;
using infinicore::Device;
using infinilm::models::debug_utils::HookRegistry;

namespace infinilm::models::llama {

/// Register the Llama-related Python bindings on module `m`.
///
/// Three classes are exposed:
///   - `HookRegistry`: `register_hook(name, callback)`, `clear()`, `has_hooks()`.
///   - `Config`: the `InfinilmModel::Config` base; no members are bound here.
///   - `LlamaConfig`: read/write fields, the `bos_token_id` / `eos_token_id` /
///     `rope_scaling` properties, `validate()`, `kv_dim()` and a custom `__dir__`.
///
/// Python-visible errors raised by the property setters:
///   - `TypeError`  if a token id is neither an int nor a list/tuple, or if
///                  `rope_scaling` is neither a dict nor None.
///   - `KeyError`   if a `rope_scaling` dict has neither "rope_type" nor "type".
///   - `ValueError` if the rope type is unsupported or "longrope" is missing a
///                  required key.
///
/// @param m Module that receives the class definitions.
inline void bind_llama(py::module &m) {
    // TODO: HookRegistry should be moved out from Llama-specific bindings to InfiniCore as common utils in future work
    py::class_<HookRegistry, std::shared_ptr<HookRegistry>>(m, "HookRegistry")
        .def(py::init<>())
        .def(
            "register_hook",
            [](HookRegistry &self, const std::string &name, py::object callback) {
                // Wrap the Python callable in a C++ callback that the registry can store.
                self.register_hook(
                    name,
                    [callback = std::move(callback)](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
                        // Calling into Python requires the GIL. Acquiring it is safe when the
                        // calling thread already holds it.
                        // NOTE(review): whether the C++ call sites release the GIL is not visible
                        // from this file; this guard makes the callback correct either way.
                        py::gil_scoped_acquire gil;
                        // A Python exception surfaces as py::error_already_set and simply
                        // propagates to the caller; no catch/rethrow is needed.
                        callback(hook_name, tensor, layer_idx);
                    });
            },
            py::arg("name"), py::arg("callback"))
        .def("clear", &HookRegistry::clear)
        .def("has_hooks", &HookRegistry::has_hooks);

    // Base class must be registered before LlamaConfig can name it as a parent.
    py::class_<InfinilmModel::Config> config(m, "Config");

    // Shared conversion for the bos/eos token id setters: a single int becomes a
    // one-element vector, a list/tuple is converted element-wise, anything else
    // is rejected with a TypeError naming the offending field.
    const auto parse_token_ids = [](const py::object &value, const char *field) -> std::vector<int64_t> {
        if (py::isinstance<py::int_>(value)) {
            return {value.cast<int64_t>()};
        }
        if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
            return value.cast<std::vector<int64_t>>();
        }
        throw py::type_error(std::string(field) + " must be int or list of ints");
    };

    // Bind LlamaConfig
    py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
    llama_config
        .def(py::init<>())
        // TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism.
        .def_readwrite("_dtype", &LlamaConfig::dtype)
        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
        .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
        .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers)
        .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads)
        .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads)
        .def_readwrite("head_dim", &LlamaConfig::head_dim)
        .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings)
        .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps)
        .def_readwrite("hidden_act", &LlamaConfig::hidden_act)
        .def_readwrite("model_type", &LlamaConfig::model_type)
        .def_readwrite("rope_theta", &LlamaConfig::rope_theta)
        .def_readwrite("attention_bias", &LlamaConfig::attention_bias)
        .def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias)
        .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
        .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
        .def_readwrite("qk_norm", &LlamaConfig::qk_norm)
        .def_readwrite("use_cache", &LlamaConfig::use_cache)
        .def_readwrite("attention_dropout", &LlamaConfig::attention_dropout)
        .def_readwrite("initializer_range", &LlamaConfig::initializer_range)
        .def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
        .def_readwrite("name_or_path", &LlamaConfig::name_or_path)
        .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
        .def_property(
            "bos_token_id",
            // Always return as list to match Python config format
            [](const LlamaConfig &self) { return py::cast(self.bos_token_id); },
            // Accept both single int and list
            [parse_token_ids](LlamaConfig &self, py::object value) {
                self.bos_token_id = parse_token_ids(value, "bos_token_id");
            })
        .def_property(
            "eos_token_id",
            // Always return as list to match Python config format
            [](const LlamaConfig &self) { return py::cast(self.eos_token_id); },
            // Accept both single int and list
            [parse_token_ids](LlamaConfig &self, py::object value) {
                self.eos_token_id = parse_token_ids(value, "eos_token_id");
            })
        .def_property(
            "rope_scaling",

            // ---------- getter ----------
            // Returns None when no scaling is configured, otherwise a dict
            // describing the scaling config. Only LongRopeConfig is recognised;
            // any other concrete type raises std::runtime_error.
            [](const LlamaConfig &self) -> py::object {
                if (!self.rope_scaling) {
                    return py::none();
                }

                using LongRopeConfig = infinicore::nn::RoPE::LongRopeConfig;

                const auto *long_rope = dynamic_cast<const LongRopeConfig *>(self.rope_scaling.get());
                if (long_rope == nullptr) {
                    throw std::runtime_error("Unknown RoPE scaling type");
                }

                py::dict d;
                // Both spellings of the type key are emitted.
                d["type"] = "longrope";
                d["rope_type"] = "longrope";
                d["factor"] = long_rope->factor();
                d["original_max_position_embeddings"] = long_rope->original_max_position_embeddings();
                d["short_factor"] = long_rope->short_factor();
                d["long_factor"] = long_rope->long_factor();

                // Explicit move: the dict is converted to the py::object return type.
                return std::move(d);
            },

            // ---------- setter ----------
            // Accepts None (clears the scaling config) or a dict whose
            // "rope_type" (preferred) or "type" entry selects the scaling kind.
            [](LlamaConfig &self, py::object value) {
                if (value.is_none()) {
                    self.rope_scaling.reset();
                    return;
                }

                if (!py::isinstance<py::dict>(value)) {
                    throw py::type_error("rope_scaling must be a dict or None");
                }

                py::dict d = value.cast<py::dict>();

                // "rope_type" takes precedence; fall back to "type" and raise
                // KeyError("type") when neither key is present.
                const char *type_key = d.contains("rope_type") ? "rope_type" : "type";
                if (!d.contains(type_key)) {
                    throw py::key_error(type_key);
                }
                const std::string type = py::cast<std::string>(d[type_key]);

                if (type != "longrope") {
                    throw py::value_error("Unsupported rope_scaling type: " + type);
                }

                using LongRopeConfig = infinicore::nn::RoPE::LongRopeConfig;

                if (!d.contains("short_factor") || !d.contains("long_factor") || !d.contains("original_max_position_embeddings")) {
                    throw py::value_error(
                        "longrope requires short_factor, long_factor, "
                        "original_max_position_embeddings");
                }

                std::vector<float> short_factor = py::cast<std::vector<float>>(d["short_factor"]);
                std::vector<float> long_factor = py::cast<std::vector<float>>(d["long_factor"]);

                const size_t original_max_position_embeddings = py::cast<size_t>(d["original_max_position_embeddings"]);

                // "factor" is optional and defaults to 1.0.
                float factor = 1.0f;
                if (d.contains("factor")) {
                    factor = py::cast<float>(d["factor"]);
                }

                self.rope_scaling = std::make_shared<LongRopeConfig>(
                    std::move(short_factor),
                    std::move(long_factor),
                    original_max_position_embeddings,
                    factor);
            })
        .def("validate", &LlamaConfig::validate)
        .def("kv_dim", &LlamaConfig::kv_dim)
        // Add __dir__ to make attributes discoverable via dir() in Python
        .def("__dir__", [](const LlamaConfig &) {
            // Names reported by dir(), in the order they are listed to Python.
            static constexpr const char *kAttributeNames[] = {
                "vocab_size",
                "hidden_size",
                "intermediate_size",
                "num_hidden_layers",
                "num_attention_heads",
                "num_key_value_heads",
                "head_dim",
                "max_position_embeddings",
                "rms_norm_eps",
                "hidden_act",
                "model_type",
                "rope_theta",
                "rope_scaling",
                "attention_bias",
                "attention_output_bias",
                "mlp_bias",
                "tie_word_embeddings",
                "qk_norm",
                "use_cache",
                "attention_dropout",
                "initializer_range",
                "pretraining_tp",
                "name_or_path",
                "pad_token_id",
                "bos_token_id",
                "eos_token_id",
                "validate",
                "kv_dim",
            };

            py::list dir_list;
            for (const char *attribute : kAttributeNames) {
                dir_list.append(attribute);
            }
            return dir_list;
        });

    // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here
}

} // namespace infinilm::models::llama