#pragma once

#include "../../cache/kv_cache.hpp"
4
5
6
#include "../../models/debug_utils/hooks.hpp"
#include "../../models/llama/llama.hpp"
#include "../../models/llama/llama_attention.hpp"
7
8
#include "infinicore/device.hpp"
#include "infinicore/nn/module.hpp"
9
10
11
12
#include "infinicore/tensor.hpp"
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
13
14
15
16
17
18
19
20
21
22
23
24

namespace py = pybind11;
using infinicore::Device;
using infinilm::models::debug_utils::HookRegistry;

namespace infinilm::models::llama {

inline void bind_llama(py::module &m) {
    // TODO: Move HookRegistry out of the Llama-specific bindings and into InfiniCore as a common utility.
    // Bind HookRegistry
    py::class_<HookRegistry, std::shared_ptr<HookRegistry>>(m, "HookRegistry")
        .def(py::init<>())
        .def(
            "register_hook", [](HookRegistry &self, const std::string &name, py::object callback) {
                // Convert Python callable to C++ function
                self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
                    try {
                        // Call Python callback with hook name, tensor, and layer index
                        callback(hook_name, tensor, layer_idx);
                    } catch (const py::error_already_set &e) {
                        // Re-raise Python exception
                        throw;
                    }
                });
            },
            py::arg("name"), py::arg("callback"))
        .def("clear", &HookRegistry::clear)
        .def("has_hooks", &HookRegistry::has_hooks);

    // Bind LlamaConfig
    py::class_<LlamaConfig> config(m, "LlamaConfig");
    config
        .def(py::init<>())
        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
        .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
        .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers)
        .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads)
        .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads)
        .def_readwrite("head_dim", &LlamaConfig::head_dim)
        .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings)
        .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps)
        .def_readwrite("hidden_act", &LlamaConfig::hidden_act)
        .def_readwrite("model_type", &LlamaConfig::model_type)
        .def_readwrite("rope_theta", &LlamaConfig::rope_theta)
        .def_readwrite("attention_bias", &LlamaConfig::attention_bias)
        .def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias)
        .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
        .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
        .def_readwrite("use_cache", &LlamaConfig::use_cache)
        .def_readwrite("attention_dropout", &LlamaConfig::attention_dropout)
        .def_readwrite("initializer_range", &LlamaConfig::initializer_range)
        .def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
        .def_readwrite("name_or_path", &LlamaConfig::name_or_path)
        .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
        .def_property("bos_token_id",
            [](const LlamaConfig &self) {
                // Always return as list to match Python config format
                return py::cast(self.bos_token_id);
            },
            [](LlamaConfig &self, py::object value) {
                // Accept both single int and list
                if (py::isinstance<py::int_>(value)) {
                    self.bos_token_id = {value.cast<int64_t>()};
                } else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
                    self.bos_token_id = value.cast<std::vector<int64_t>>();
                } else {
                    throw py::type_error("bos_token_id must be int or list of ints");
                }
            })
        .def_property("eos_token_id",
            [](const LlamaConfig &self) {
                // Always return as list to match Python config format
                return py::cast(self.eos_token_id);
            },
            [](LlamaConfig &self, py::object value) {
                // Accept both single int and list
                if (py::isinstance<py::int_>(value)) {
                    self.eos_token_id = {value.cast<int64_t>()};
                } else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
                    self.eos_token_id = value.cast<std::vector<int64_t>>();
                } else {
                    throw py::type_error("eos_token_id must be int or list of ints");
                }
            })
        .def("validate", &LlamaConfig::validate)
        .def("kv_dim", &LlamaConfig::kv_dim)
        // Add __dir__ to make attributes discoverable via dir() in Python
        .def("__dir__", [](const LlamaConfig &self) {
            py::list dir_list;
            dir_list.append("vocab_size");
            dir_list.append("hidden_size");
            dir_list.append("intermediate_size");
            dir_list.append("num_hidden_layers");
            dir_list.append("num_attention_heads");
            dir_list.append("num_key_value_heads");
            dir_list.append("head_dim");
            dir_list.append("max_position_embeddings");
            dir_list.append("rms_norm_eps");
            dir_list.append("hidden_act");
            dir_list.append("model_type");
            dir_list.append("rope_theta");
            dir_list.append("attention_bias");
            dir_list.append("attention_output_bias");
            dir_list.append("mlp_bias");
            dir_list.append("tie_word_embeddings");
            dir_list.append("use_cache");
            dir_list.append("attention_dropout");
            dir_list.append("initializer_range");
            dir_list.append("pretraining_tp");
            dir_list.append("name_or_path");
            dir_list.append("pad_token_id");
            dir_list.append("bos_token_id");
            dir_list.append("eos_token_id");
            dir_list.append("validate");
            dir_list.append("kv_dim");
            return dir_list;
        });
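
    // Sketch of how the bos_token_id / eos_token_id properties above behave from Python
    // (the module name `infinilm_llama` is a placeholder for the actual import path):
    //
    //     cfg = infinilm_llama.LlamaConfig()
    //     cfg.bos_token_id = 1            # a single int is stored as a one-element list
    //     cfg.eos_token_id = [2, 32000]   # lists and tuples are accepted as-is
    //     assert cfg.bos_token_id == [1]  # the getter always returns a list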

    // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here

    // Bind LlamaForCausalLM
    py::class_<LlamaForCausalLM, std::shared_ptr<LlamaForCausalLM>>(m, "LlamaForCausalLM")
        .def(py::init([](const LlamaConfig &config, const Device &device, py::object dtype_obj) {
                 infinicore::DataType dtype = infinicore::DataType::F32;
                 if (!dtype_obj.is_none()) {
                     // Extract dtype from Python object
                     if (py::hasattr(dtype_obj, "_underlying")) {
                         dtype = dtype_obj.attr("_underlying").cast<infinicore::DataType>();
                     } else {
                         dtype = dtype_obj.cast<infinicore::DataType>();
                     }
                 }
                 return std::make_shared<LlamaForCausalLM>(config, device, dtype);
             }),
             py::arg("config"), py::arg("device"), py::arg("dtype") = py::none())
        .def("state_dict", [](const LlamaForCausalLM &model) {
            // Return a dictionary containing references to the whole state of the module.
            auto state_dict = model.state_dict();
            py::dict result;
            for (const auto &[name, param] : state_dict) {
                result[py::cast(name)] = infinicore::Tensor(param);
            }
            return result;
        })
        .def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
                // Get actual tensor parameter by name
                auto state_dict = model.state_dict();
                auto it = state_dict.find(name);
                if (it != state_dict.end()) {
                    // Parameter inherits from Tensor, cast to Tensor for pybind11
                    const infinicore::Tensor &tensor = it->second;
                    return tensor;
                }
                throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
        .def("load_state_dict", [](LlamaForCausalLM &model, py::dict state_dict) {
                // Convert Python dict to C++ state_dict
                std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
                for (auto item : state_dict) {
                    std::string key = item.first.cast<std::string>();
                    py::object value = item.second.cast<py::object>();
                    // Extract InfiniCore tensor from Python object
                    infinicore::Tensor tensor;
                    if (py::hasattr(value, "_underlying")) {
                        tensor = value.attr("_underlying").cast<infinicore::Tensor>();
                    } else {
                        tensor = value.cast<infinicore::Tensor>();
                    }
                    cpp_state_dict.emplace(key, tensor);
                }
                model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"))
        .def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
        .def(
            "reset_cache", [](const LlamaForCausalLM &model, size_t pos = 0) {
            // Reset the internal cache to prevent state from persisting between generations
            model.model().reset_cache(pos);
        }, py::arg("pos") = 0, "Reset the internal cache to a specific position (clears state between generations)")
        .def("forward", [](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_cache = py::none()) {
                // Helper to extract C++ tensor from Python InfiniCore tensor
                auto get_tensor = [](py::object obj) -> infinicore::Tensor {
                    // If it's already a Python InfiniCore tensor wrapper, extract underlying
                    if (py::hasattr(obj, "_underlying")) {
                        return obj.attr("_underlying").cast<infinicore::Tensor>();
                    }
                    // Try direct cast (in case it's already a C++ tensor)
                    return obj.cast<infinicore::Tensor>();
                };

                // Extract InfiniCore tensors from Python objects
                auto infini_input_ids = get_tensor(input_ids);
                auto infini_position_ids = get_tensor(position_ids);

                // Handle kv_cache if provided (model-level DynamicCache)
                void *kv_cache_ptr = nullptr;
                if (!kv_cache.is_none()) {
                    // Try to extract DynamicCache from Python object
                    if (py::hasattr(kv_cache, "_underlying")) {
                        kv_cache_ptr = kv_cache.attr("_underlying").cast<void *>();
                    } else {
                        // Try direct cast
                        try {
                            kv_cache_ptr = kv_cache.cast<void *>();
                        } catch (...) {
                            // If conversion fails, pass nullptr (cache will be ignored)
                            kv_cache_ptr = nullptr;
                        }
                    }
                }

                return model.forward(infini_input_ids, infini_position_ids, kv_cache_ptr);
        }, py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
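
// Rough end-to-end sketch of using these bindings from Python. The module name
// `infinilm_llama`, the `device` object (bound by InfiniCore), and the `weights`,
// `input_ids`, and `position_ids` values are placeholders; only the class and method
// names come from the bindings above:
//
//     cfg = infinilm_llama.LlamaConfig()
//     model = infinilm_llama.LlamaForCausalLM(cfg, device, dtype=None)
//     model.load_state_dict(weights)       # dict mapping parameter names to tensors
//     model.reset_cache(0)                 # clear cached state between generations
//     out = model.forward(input_ids, position_ids)   # kv_caches defaults to None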

} // namespace infinilm::models::llama