#pragma once

#include "../../cache/kv_cache.hpp"
#include "../../models/debug_utils/hooks.hpp"
#include "../../models/llama/llama.hpp"
#include "../../models/llama/llama_attention.hpp"
#include "infinicore/device.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/tensor.hpp"
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

namespace py = pybind11;
using infinicore::Device;
using infinilm::models::debug_utils::HookRegistry;

namespace infinilm::models::llama {

inline void bind_llama(py::module &m) {
    // TODO: Move HookRegistry out of the Llama-specific bindings into InfiniCore as a common utility.
    // Bind HookRegistry
    py::class_<HookRegistry, std::shared_ptr<HookRegistry>>(m, "HookRegistry")
        .def(py::init<>())
        .def(
            "register_hook", [](HookRegistry &self, const std::string &name, py::object callback) {
                // Wrap the Python callable in a C++ function; Python exceptions
                // raised by the callback propagate back to the caller unchanged.
                self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
                    // Call the Python callback with the hook name, tensor, and layer index
                    callback(hook_name, tensor, layer_idx);
                });
            },
            py::arg("name"), py::arg("callback"))
        .def("clear", &HookRegistry::clear)
        .def("has_hooks", &HookRegistry::has_hooks);

    // Bind the base Config class first so LlamaConfig can be registered as its subclass
    py::class_<InfinilmModel::Config> config(m, "Config");

    // Bind LlamaConfig
    py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
    llama_config
        .def(py::init<>())
        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
        .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
        .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers)
        .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads)
        .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads)
        .def_readwrite("head_dim", &LlamaConfig::head_dim)
        .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings)
        .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps)
        .def_readwrite("hidden_act", &LlamaConfig::hidden_act)
        .def_readwrite("model_type", &LlamaConfig::model_type)
        .def_readwrite("rope_theta", &LlamaConfig::rope_theta)
        .def_readwrite("attention_bias", &LlamaConfig::attention_bias)
        .def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias)
        .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
        .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
        .def_readwrite("use_cache", &LlamaConfig::use_cache)
        .def_readwrite("attention_dropout", &LlamaConfig::attention_dropout)
        .def_readwrite("initializer_range", &LlamaConfig::initializer_range)
        .def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
        .def_readwrite("name_or_path", &LlamaConfig::name_or_path)
        .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
        .def_property("bos_token_id", [](const LlamaConfig &self) {
                // Always return a list to match the Python config format
                return py::cast(self.bos_token_id); }, [](LlamaConfig &self, py::object value) {
                // Accept either a single int or a list/tuple of ints
                if (py::isinstance<py::int_>(value)) {
                    self.bos_token_id = {value.cast<int64_t>()};
                } else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
                    self.bos_token_id = value.cast<std::vector<int64_t>>();
                } else {
                    throw py::type_error("bos_token_id must be int or list of ints");
                } })
        .def_property("eos_token_id", [](const LlamaConfig &self) {
Ceng's avatar
Ceng committed
82
                // Always return as list to match Python config format
Your Name's avatar
Your Name committed
83
                return py::cast(self.eos_token_id); }, [](LlamaConfig &self, py::object value) {
                // Accept either a single int or a list/tuple of ints
                if (py::isinstance<py::int_>(value)) {
                    self.eos_token_id = {value.cast<int64_t>()};
                } else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
                    self.eos_token_id = value.cast<std::vector<int64_t>>();
                } else {
                    throw py::type_error("eos_token_id must be int or list of ints");
                } })
        .def("validate", &LlamaConfig::validate)
        .def("kv_dim", &LlamaConfig::kv_dim)
        // Add __dir__ to make attributes discoverable via dir() in Python
        .def("__dir__", [](const LlamaConfig &self) {
            py::list dir_list;
            dir_list.append("vocab_size");
            dir_list.append("hidden_size");
            dir_list.append("intermediate_size");
            dir_list.append("num_hidden_layers");
            dir_list.append("num_attention_heads");
            dir_list.append("num_key_value_heads");
            dir_list.append("head_dim");
            dir_list.append("max_position_embeddings");
            dir_list.append("rms_norm_eps");
            dir_list.append("hidden_act");
            dir_list.append("model_type");
            dir_list.append("rope_theta");
            dir_list.append("attention_bias");
            dir_list.append("attention_output_bias");
            dir_list.append("mlp_bias");
            dir_list.append("tie_word_embeddings");
            dir_list.append("use_cache");
            dir_list.append("attention_dropout");
            dir_list.append("initializer_range");
            dir_list.append("pretraining_tp");
            dir_list.append("name_or_path");
            dir_list.append("pad_token_id");
            dir_list.append("bos_token_id");
            dir_list.append("eos_token_id");
            dir_list.append("validate");
            dir_list.append("kv_dim");
            return dir_list; });
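
    // Illustrative Python usage of the token-id properties (a sketch; the
    // values are placeholders):
    //
    //     cfg = infinilm.LlamaConfig()
    //     cfg.eos_token_id = 2           # a single int is normalized to [2]
    //     cfg.eos_token_id = [2, 32000]  # a list or tuple is stored as-is
    //     ids = cfg.eos_token_id         # always read back as a list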

    // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here

    // Bind LlamaForCausalLM
    py::class_<LlamaForCausalLM, std::shared_ptr<LlamaForCausalLM>>(m, "LlamaForCausalLM")
        .def("state_dict", [](const LlamaForCausalLM &model) {
            // Return a dictionary containing references to the whole state of the module.
            auto state_dict = model.state_dict();
            py::dict result;
            for (const auto &[name, param] : state_dict) {
                result[py::cast(name)] = infinicore::Tensor(param);
            }
            return result;
        })
        .def("get_parameter", [](const LlamaForCausalLM &model, const std::string &name) {
                // Get the actual tensor parameter by name
                auto state_dict = model.state_dict();
                auto it = state_dict.find(name);
                if (it != state_dict.end()) {
                    // Parameter inherits from Tensor, cast to Tensor for pybind11
                    const infinicore::Tensor &tensor = it->second;
                    return tensor;
                }
                throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
        .def("load_state_dict", [](LlamaForCausalLM &model, py::dict state_dict) {
                // Convert the Python dict to a C++ state_dict
                std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
                for (auto item : state_dict) {
                    std::string key = item.first.cast<std::string>();
                    py::object value = item.second.cast<py::object>();

                    // Extract the InfiniCore tensor from the Python object
                    infinicore::Tensor tensor;
                    if (py::hasattr(value, "_underlying")) {
                        tensor = value.attr("_underlying").cast<infinicore::Tensor>();
                    } else {
                        tensor = value.cast<infinicore::Tensor>();
                    }
                    cpp_state_dict.emplace(key, tensor);
                }
                model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"))
        .def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
        .def("reset_cache", [](const LlamaForCausalLM &model, size_t pos = 0) {
Ceng's avatar
Ceng committed
166
            // Reset the internal cache to prevent state from persisting between generations
Your Name's avatar
Your Name committed
167
            model.model().reset_cache(pos); }, py::arg("pos") = 0, "Reset the internal cache to a specific position (clears state between generations)")
Ceng's avatar
Ceng committed
168
169
170
        .def("forward", [](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_cache = py::none()) {
                // Helper to extract C++ tensor from Python InfiniCore tensor
                auto get_tensor = [](py::object obj) -> infinicore::Tensor {
171
172
173
174
175
                    // If it's already a Python InfiniCore tensor wrapper, extract underlying
                    if (py::hasattr(obj, "_underlying")) {
                        return obj.attr("_underlying").cast<infinicore::Tensor>();
                    }
                    // Try direct cast (in case it's already a C++ tensor)
Ceng's avatar
Ceng committed
176
                    return obj.cast<infinicore::Tensor>();
177
                };
178

Ceng's avatar
Ceng committed
179
                // Extract InfiniCore tensors from Python objects
180
181
                auto infini_input_ids = get_tensor(input_ids);
                auto infini_position_ids = get_tensor(position_ids);
182

Ceng's avatar
Ceng committed
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
            // Handle kv_cache if provided (model-level DynamicCache)
            void *kv_cache_ptr = nullptr;
            if (!kv_cache.is_none()) {
                // Try to extract DynamicCache from Python object
                if (py::hasattr(kv_cache, "_underlying")) {
                    kv_cache_ptr = kv_cache.attr("_underlying").cast<void *>();
                } else {
                    // Try direct cast
                    try {
                        kv_cache_ptr = kv_cache.cast<void *>();
                    } catch (...) {
                        // If conversion fails, pass nullptr (cache will be ignored)
                        kv_cache_ptr = nullptr;
                    }
                }
            }
199

Your Name's avatar
Your Name committed
200
            return model.forward(infini_input_ids, infini_position_ids, kv_cache_ptr); }, py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
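
    // Illustrative Python usage (a sketch; assumes `model` is a LlamaForCausalLM
    // instance obtained elsewhere, and that `weights`, the id tensors, and `cache`
    // are placeholders for InfiniCore tensors / a DynamicCache, either raw or
    // wrapped in an object exposing `_underlying`):
    //
    //     model.load_state_dict(weights)   # dict mapping parameter name -> tensor
    //     model.reset_cache()              # clear cached state between generations
    //     logits = model.forward(input_ids, position_ids, kv_cache=cache)
    //     w = model.get_parameter("model.embed_tokens.weight")  # placeholder name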
}

} // namespace infinilm::models::llama