#include "model.h" #include "factoryllm.h" namespace pyfastllm{ // 对接不断更新的后端接口 // 需优化,减少内存拷贝 fastllm::Data RMSNorm(const fastllm::Data &input, const fastllm::Data &weight, float eps){ fastllm::Data output; // std::cout<<"run rms norm"< 0) ss += "\n"; for (int j = 0; j < 10 && j < m; j++) { if (j>0) ss += " "; ss += std::to_string(reinterpret_cast(data.cpuData)[i*m+j]); } if (m > 10) { ss += "... "; for (int j = 0; j < 10 && j < m; j++) { if (j>0) ss += " "; ss += std::to_string(reinterpret_cast(data.cpuData)[i*m + (m-10+j)]); } } } ss += "]"; return ss; } } #ifdef PY_API #include #include #include #include #include namespace py = pybind11; using namespace pybind11::literals; // template // using overload_cast_ = pybind11::detail::overload_cast_impl; using pastKV = std::vector>; // PYBIND11_MAKE_OPAQUE(std::vector>); PYBIND11_MAKE_OPAQUE(fastllm::Data); PYBIND11_MODULE(pyfastllm, m) { m.doc() = "fastllm python bindings"; py::class_(m, "GenerationConfig") .def(py::init<>()) .def_readwrite("max_length", &fastllm::GenerationConfig::output_token_limit) .def_readwrite("last_n", &fastllm::GenerationConfig::last_n) .def_readwrite("repeat_penalty", &fastllm::GenerationConfig::repeat_penalty) .def_readwrite("top_k", &fastllm::GenerationConfig::top_k) .def_readwrite("top_p", &fastllm::GenerationConfig::top_p) .def_readwrite("temperature", &fastllm::GenerationConfig::temperature) .def_readwrite("enable_hash_id", &fastllm::GenerationConfig::enable_hash_id) .def("is_simple_greedy", &fastllm::GenerationConfig::IsSimpleGreedy); // high level m.def("set_threads", &fastllm::SetThreads) .def("get_threads", &fastllm::GetThreads) .def("set_low_memory", &fastllm::SetLowMemMode) .def("get_low_memory", &fastllm::GetLowMemMode) .def("set_kv_cache", &fastllm::SetKVCacheInCPU) .def("get_kv_cache", &fastllm::GetKVCacheInCPU) .def("set_device_map", &fastllm::SetDeviceMap) .def("create_llm", &fastllm::CreateLLMModelFromFile); m.def("std_hash", [](std::string input) -> size_t { return std::hash{}(input); }); // low level m.def("get_llm_type", &fastllm::GetModelTypeFromFile); m.def("llm_sampling", &fastllm::LLMSampling) // .def("embedding", &fastllm::Embedding) .def("rms_norm", &pyfastllm::RMSNorm) .def("layer_norm", &pyfastllm::LayerNorm) .def("linear", &pyfastllm::Linear) // .def("split", &fastllm::Split) // .def("cat", &fastllm::Cat) // .def("cat_direct", &fastllm::CatDirect) .def("matmul", &pyfastllm::MatMul) // .def("matmul_transB", &fastllm::MatMulTransB) .def("softmax", &pyfastllm::Softmax) .def("silu", &pyfastllm::Silu) .def("gelu", &pyfastllm::Gelu) .def("swiglu", &pyfastllm::Swiglu) .def("mul", &pyfastllm::Mul) .def("attention", &pyfastllm::Attention); // .def("mul_to", &fastllm::MulTo) // .def("add_to", &fastllm::AddTo) // .def("attention_mask", &fastllm::AttentionMask) // .def("alibi_mask", &fastllm::AlibiMask) // .def("permute", &fastllm::Permute) // .def("permute_self", &fastllm::PermuteSelf) // .def("topk", &fastllm::TopK) // .def("rotateposition2D", &fastllm::RotatePosition2D) // .def("nearlyrotateposition2D", &fastllm::NearlyRotatePosition2D) // .def("llama_rotateposition2D", &fastllm::LlamaRotatePosition2D) // .def("repeat_penalty", &fastllm::RepeatPenalty); py::enum_(m, "Dtype") .value("float32", fastllm::DataType::FLOAT32) .value("bfloat16", fastllm::DataType::BFLOAT16) .value("int16", fastllm::DataType::INT16) .value("int8", fastllm::DataType::INT8) .value("int4", fastllm::DataType::INT4) .value("int2", fastllm::DataType::INT2) .value("float16", fastllm::DataType::FLOAT16) .value("bit", fastllm::DataType::BIT) .value("int32param", fastllm::DataType::INT32PARAM) .export_values(); py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer([](fastllm::Data &m) -> py::buffer_info { return py::buffer_info( m.cpuData, /* Pointer to buffer */ sizeof(float), /* Size of one scalar */ py::format_descriptor::format(), /* Python struct-style format descriptor */ m.dims.size(), /* Number of dimensions */ m.dims, /* Buffer dimensions */ { sizeof(float) * m.dims[1], /* Strides (in bytes) for each index */ sizeof(float) } ); }) .def_readonly("dims", &fastllm::Data::dims) .def(py::init<>()) .def(py::init()) .def(py::init&>()) .def(py::init&, const std::vector&>()) .def(py::init()) .def_readonly("shape", &fastllm::Data::dims) .def("copy_from", &fastllm::Data::CopyFrom) .def("count", &fastllm::Data::Count) .def("to_list", [](fastllm::Data& data){ std::vector vecData; for (int i = 0; i < data.Count(0); i++) { vecData.push_back(((float*)data.cpuData)[i]); } return vecData; }) .def("__str__", &pyfastllm::String) .def("print", &fastllm::Data::Print) .def("to", static_cast(&fastllm::Data::ToDevice)); m.def("zeros", [](const std::vector &dims, fastllm::DataType dtype)->fastllm::Data { int nums = 1; for (auto dim:dims){nums *= dim; } std::vectorzero_data(nums, 0); auto data = fastllm::Data(dtype, dims, zero_data); return data; }, py::arg("dims"), py::arg("dtype")); m.def("cat", [](std::vector datas, int dim)->fastllm::Data { // int pos_dim = 0; // // dim check // for (int i=0;i vecData; for (auto data:datas){ for (int i = 0; i < data.Count(0); i++) { vecData.push_back(((float*)data.cpuData)[i]); } } int seqLen = vecData.size(); return fastllm::Data(fastllm::DataType::FLOAT32, {1, seqLen}, vecData); }); py::class_(m, "Tokenizer") .def("encode", &fastllm::Tokenizer::Encode) // .def("decode", &fastllm::Tokenizer::Decode) .def("decode", &fastllm::Tokenizer::Decode, "Decode from Tensor") .def("decode", &fastllm::Tokenizer::DecodeTokens, "Decode from Vector") .def("decode_byte", [](fastllm::Tokenizer &tokenizer, const fastllm::Data &data){ std::string ret = tokenizer.Decode(data); return py::bytes(ret); }) .def("decode_byte", [](fastllm::Tokenizer &tokenizer, const std::vector& data){ std::string ret = tokenizer.DecodeTokens(data); return py::bytes(ret); }) .def("clear", &fastllm::Tokenizer::Clear) .def("insert", &fastllm::Tokenizer::Insert); py::class_(m, "WeightMap") .def_readonly("tokenizer", &fastllm::WeightMap::tokenizer) .def("save_lowbit", &fastllm::WeightMap::SaveLowBitModel) .def("set_kv", &fastllm::WeightMap::AddDict) .def("set_weight", &fastllm::WeightMap::AddWeight) .def("__getitem__", [](fastllm::WeightMap &weight, std::string key){ return weight[key]; }); // model classes py::class_(m, "basellm"); py::class_(m, "ChatGLMModel") .def(py::init<>()) .def_readonly("model_type", &fastllm::ChatGLMModel::model_type) .def_readonly("weight", &fastllm::ChatGLMModel::weight) .def_readonly("block_cnt", &fastllm::ChatGLMModel::block_cnt) .def_readonly("bos_token_id", &fastllm::ChatGLMModel::bos_token_id) .def_readonly("eos_token_id", &fastllm::ChatGLMModel::eos_token_id) .def_readonly("gmask_token_id", &fastllm::ChatGLMModel::gmask_token_id) .def("load_weights", &fastllm::ChatGLMModel::LoadFromFile) .def("make_input", &fastllm::ChatGLMModel::MakeInput) .def("make_history", &fastllm::ChatGLMModel::MakeHistory) .def("response", &fastllm::ChatGLMModel::Response) .def("batch_response", [](fastllm::ChatGLMModel &model, const std::vector &inputs, RuntimeResultBatch retCb, fastllm::GenerationConfig config)->std::vector { std::vector outputs; model.ResponseBatch(inputs, outputs, retCb, config); return outputs; }) .def("warmup", &fastllm::ChatGLMModel::WarmUp) .def("forward", [](fastllm::ChatGLMModel &model, const fastllm::Data &inputIds, const fastllm::Data &attentionMask, const fastllm::Data &positionIds, std::vector> &pastKeyValues, const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) { int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens); return std::make_tuple(retV, pastKeyValues); }) .def("launch_response", &fastllm::ChatGLMModel::LaunchResponseTokens) .def("fetch_response", &fastllm::ChatGLMModel::FetchResponseTokens) .def("save_lowbit_model", &fastllm::ChatGLMModel::SaveLowBitModel) .def("make_input", &fastllm::ChatGLMModel::MakeInput); py::class_(m, "MOSSModel") .def(py::init<>()) .def_readonly("model_type", &fastllm::MOSSModel::model_type) .def_readonly("weight", &fastllm::MOSSModel::weight) .def_readonly("block_cnt", &fastllm::MOSSModel::block_cnt) .def_readonly("bos_token_id", &fastllm::MOSSModel::bos_token_id) .def_readonly("eos_token_id", &fastllm::MOSSModel::eos_token_id) .def("load_weights", &fastllm::MOSSModel::LoadFromFile) .def("make_input", &fastllm::MOSSModel::MakeInput) .def("make_history", &fastllm::MOSSModel::MakeHistory) .def("response", &fastllm::MOSSModel::Response) .def("batch_response", [](fastllm::MOSSModel &model, const std::vector &inputs, RuntimeResultBatch retCb, fastllm::GenerationConfig config)->std::vector { std::vector outputs; model.ResponseBatch(inputs, outputs, retCb, config); return outputs; }) .def("forward", [](fastllm::MOSSModel &model, const fastllm::Data &inputIds, const fastllm::Data &attentionMask, const fastllm::Data &positionIds, std::vector> &pastKeyValues, const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) { int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens); return std::make_tuple(retV, pastKeyValues); }) .def("launch_response", &fastllm::MOSSModel::LaunchResponseTokens) .def("fetch_response", &fastllm::MOSSModel::FetchResponseTokens) .def("save_lowbit_model", &fastllm::MOSSModel::SaveLowBitModel) .def("make_input", &fastllm::MOSSModel::MakeInput); py::class_(m, "LlamaModel") .def(py::init<>()) .def_readonly("model_type", &fastllm::LlamaModel::model_type) .def_readonly("weight", &fastllm::LlamaModel::weight) .def_readonly("block_cnt", &fastllm::LlamaModel::block_cnt) .def_readonly("bos_token_id", &fastllm::LlamaModel::bos_token_id) .def_readonly("eos_token_id", &fastllm::LlamaModel::eos_token_id) .def("load_weights", &fastllm::LlamaModel::LoadFromFile) .def("make_input", &fastllm::LlamaModel::MakeInput) .def("make_history", &fastllm::LlamaModel::MakeHistory) .def("response", &fastllm::LlamaModel::Response) .def("batch_response", [](fastllm::LlamaModel &model, const std::vector &inputs, RuntimeResultBatch retCb, fastllm::GenerationConfig config)->std::vector { std::vector outputs; model.ResponseBatch(inputs, outputs, retCb, config); return outputs; }) .def("warmup", &fastllm::LlamaModel::WarmUp) .def("forward", [](fastllm::LlamaModel &model, const fastllm::Data &inputIds, const fastllm::Data &attentionMask, const fastllm::Data &positionIds, std::vector> &pastKeyValues, const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) { int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens); return std::make_tuple(retV, pastKeyValues); }) .def("launch_response", &fastllm::LlamaModel::LaunchResponseTokens) .def("fetch_response", &fastllm::LlamaModel::FetchResponseTokens) .def("save_lowbit_model", &fastllm::LlamaModel::SaveLowBitModel) .def("make_input", &fastllm::LlamaModel::MakeInput); py::class_(m, "QWenModel") .def(py::init<>()) .def_readonly("model_type", &fastllm::QWenModel::model_type) .def_readonly("weight", &fastllm::QWenModel::weight) .def_readonly("block_cnt", &fastllm::QWenModel::block_cnt) .def_readonly("bos_token_id", &fastllm::QWenModel::bos_token_id) .def_readonly("eos_token_id", &fastllm::QWenModel::eos_token_id) .def("load_weights", &fastllm::QWenModel::LoadFromFile) .def("make_input", &fastllm::QWenModel::MakeInput) .def("make_history", &fastllm::QWenModel::MakeHistory) .def("response", &fastllm::QWenModel::Response) .def("batch_response", [](fastllm::QWenModel &model, const std::vector &inputs, RuntimeResultBatch retCb, fastllm::GenerationConfig config)->std::vector { std::vector outputs; model.ResponseBatch(inputs, outputs, retCb, config); return outputs; }) .def("warmup", &fastllm::QWenModel::WarmUp) .def("forward", [](fastllm::QWenModel &model, const fastllm::Data &inputIds, const fastllm::Data &attentionMask, const fastllm::Data &positionIds, std::vector> &pastKeyValues, const fastllm::GenerationConfig &generationConfig, const fastllm::LastTokensManager &tokens) { int retV = model.Forward(inputIds, attentionMask, positionIds, pastKeyValues, generationConfig, tokens); return std::make_tuple(retV, pastKeyValues); }) .def("launch_response", &fastllm::QWenModel::LaunchResponseTokens) .def("fetch_response", &fastllm::QWenModel::FetchResponseTokens) .def("save_lowbit_model", &fastllm::QWenModel::SaveLowBitModel) .def("make_input", &fastllm::QWenModel::MakeInput); #ifdef VERSION_INFO m.attr("__version__") = VERSION_INFO; #else m.attr("__version__") = "dev"; #endif } #endif