Commit 38ac084d authored by PanZezhong
Browse files

issue/95 将pybind target命名为_infinilm

parent ae3ebe19
#pragma once #pragma once
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp" #include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include <algorithm> #include <algorithm>
#include <utility>
#include <memory> #include <memory>
#include <utility>
namespace infinilm::cache { namespace infinilm::cache {
...@@ -18,11 +18,11 @@ namespace infinilm::cache { ...@@ -18,11 +18,11 @@ namespace infinilm::cache {
* that needs KV caching for attention mechanisms. * that needs KV caching for attention mechanisms.
*/ */
struct KVCache { struct KVCache {
infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim] infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim] infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
size_t cache_position; // Current position in cache size_t cache_position; // Current position in cache
size_t max_capacity; // Maximum capacity of cache size_t max_capacity; // Maximum capacity of cache
bool initialized; // Whether cache has been initialized bool initialized; // Whether cache has been initialized
KVCache() KVCache()
: cache_position(0), max_capacity(0), initialized(false), : cache_position(0), max_capacity(0), initialized(false),
...@@ -41,12 +41,12 @@ struct KVCache { ...@@ -41,12 +41,12 @@ struct KVCache {
* @param device Device * @param device Device
*/ */
void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len, void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
infinicore::DataType dtype, const infinicore::Device &device) { infinicore::DataType dtype, const infinicore::Device &device) {
size_t required_capacity = cache_position + seq_len; size_t required_capacity = cache_position + seq_len;
// Lazy initialization // Lazy initialization
if (!initialized) { if (!initialized) {
max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096 max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim}, k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
dtype, device); dtype, device);
v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim}, v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
...@@ -94,7 +94,7 @@ struct KVCache { ...@@ -94,7 +94,7 @@ struct KVCache {
// Ensure capacity // Ensure capacity
ensure_capacity(num_kv_heads, head_dim, seq_len, ensure_capacity(num_kv_heads, head_dim, seq_len,
k_new->dtype(), k_new->device()); k_new->dtype(), k_new->device());
// Copy new k/v into cache at current position // Copy new k/v into cache at current position
auto k_dst = k_cache->narrow({{1, cache_position, seq_len}}); auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});
...@@ -113,4 +113,4 @@ struct KVCache { ...@@ -113,4 +113,4 @@ struct KVCache {
} }
}; };
} // namespace infinilm::models::common } // namespace infinilm::cache
...@@ -3,17 +3,17 @@ ...@@ -3,17 +3,17 @@
#include "infinicore/nn/rope.hpp" #include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp" #include "infinicore/ops.hpp"
#include "infinicore/ops/mul.hpp" #include "infinicore/ops/mul.hpp"
#include <spdlog/spdlog.h> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <stdexcept>
#include <iostream> #include <iostream>
#include <algorithm> #include <spdlog/spdlog.h>
#include <stdexcept>
namespace infinilm::models::llama { namespace infinilm::models::llama {
LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) infinicore::DataType dtype)
: hidden_size_(config.hidden_size), : hidden_size_(config.hidden_size),
num_attention_heads_(config.num_attention_heads), num_attention_heads_(config.num_attention_heads),
num_key_value_heads_(config.num_key_value_heads), num_key_value_heads_(config.num_key_value_heads),
...@@ -22,19 +22,18 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Devi ...@@ -22,19 +22,18 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Devi
use_bias_(config.attention_bias) { use_bias_(config.attention_bias) {
// Initialize projection layers // Initialize projection layers
INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_, INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device); dtype, device);
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_, INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device); dtype, device);
INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_, INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device); dtype, device);
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_, INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device); dtype, device);
} }
infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states, infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
void *kv_cache) const { void *kv_cache) const {
if (!rotary_emb_) { if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured"); throw std::runtime_error("LlamaAttention: rotary_emb not configured");
} }
...@@ -45,12 +44,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -45,12 +44,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
size_t seq_len = shape[1]; size_t seq_len = shape[1];
// 1. Project Q, K, V // 1. Project Q, K, V
auto q = q_proj_->forward(hidden_states_mutable); // [batch, seq_len, hidden_size] auto q = q_proj_->forward(hidden_states_mutable); // [batch, seq_len, hidden_size]
auto k = k_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
auto v = v_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim] auto k = k_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
auto v = v_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
// 2. Reshape for multi-head attention // 2. Reshape for multi-head attention
...@@ -84,7 +82,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -84,7 +82,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto q_for_rope = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_})->contiguous(); auto q_for_rope = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_})->contiguous();
auto k_for_rope = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_})->contiguous(); auto k_for_rope = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_})->contiguous();
// Call RoPE on full batch (matching Python pattern) // Call RoPE on full batch (matching Python pattern)
auto q_rope_out = rotary_emb_->forward(q_for_rope, pos_ids_for_rope); auto q_rope_out = rotary_emb_->forward(q_for_rope, pos_ids_for_rope);
auto k_rope_out = rotary_emb_->forward(k_for_rope, pos_ids_for_rope); auto k_rope_out = rotary_emb_->forward(k_for_rope, pos_ids_for_rope);
...@@ -98,8 +95,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -98,8 +95,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto output_tensor = infinicore::Tensor::empty( auto output_tensor = infinicore::Tensor::empty(
{batch_size, seq_len, hidden_size_}, {batch_size, seq_len, hidden_size_},
q->dtype(), q->dtype(),
q->device() q->device());
);
for (size_t b = 0; b < batch_size; ++b) { for (size_t b = 0; b < batch_size; ++b) {
// Extract batch item from RoPE output (already computed above for full batch) // Extract batch item from RoPE output (already computed above for full batch)
...@@ -110,13 +106,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -110,13 +106,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Convert to [n_head, seq_len, head_dim] for cache // Convert to [n_head, seq_len, head_dim] for cache
// Ensure contiguous after permute for F16 compatibility with cache operations // Ensure contiguous after permute for F16 compatibility with cache operations
auto q_rope = q_batch->permute({1, 0, 2})->contiguous(); // [n_q_head, seq_len, head_dim] auto q_rope = q_batch->permute({1, 0, 2})->contiguous(); // [n_q_head, seq_len, head_dim]
auto k_rope = k_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim] auto k_rope = k_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]
auto v_permuted = v_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim] auto v_permuted = v_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]
// 5. Prepare KV caches // 5. Prepare KV caches
infinicore::Tensor k_total = infinicore::Tensor::empty({1, 1, 1}, k_rope->dtype(), k_rope->device()); infinicore::Tensor k_total;
infinicore::Tensor v_total = infinicore::Tensor::empty({1, 1, 1}, v_permuted->dtype(), v_permuted->device()); infinicore::Tensor v_total;
if (external_cache != nullptr) { if (external_cache != nullptr) {
auto [k_total_tmp, v_total_tmp] = external_cache->update(k_rope, v_permuted); auto [k_total_tmp, v_total_tmp] = external_cache->update(k_rope, v_permuted);
k_total = k_total_tmp; k_total = k_total_tmp;
...@@ -136,11 +132,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -136,11 +132,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Extract from KV cache (k_total and v_total are [n_kv_head, total_seq_len, head_dim]) // Extract from KV cache (k_total and v_total are [n_kv_head, total_seq_len, head_dim])
// Python: key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim)) // Python: key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
// Python's narrow+view ensures contiguous memory, so we need to ensure contiguous before permute // Python's narrow+view ensures contiguous memory, so we need to ensure contiguous before permute
auto k_for_attn = k_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim] auto k_for_attn = k_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
auto v_for_attn = v_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim] auto v_for_attn = v_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
// q_batch is already [seq_len, n_q_head, head_dim] from above // q_batch is already [seq_len, n_q_head, head_dim] from above
auto q_for_attn = q_batch; // [seq_len, n_q_head, head_dim] auto q_for_attn = q_batch; // [seq_len, n_q_head, head_dim]
// Python: grouped_query_attention calls repeat_kv if ngroup > 1 // Python: grouped_query_attention calls repeat_kv if ngroup > 1
// Python: repeat_kv expands [total_seq_len, num_key_value_heads, head_dim] -> [total_seq_len, num_attention_heads, head_dim] // Python: repeat_kv expands [total_seq_len, num_key_value_heads, head_dim] -> [total_seq_len, num_attention_heads, head_dim]
...@@ -154,15 +150,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -154,15 +150,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
auto k_strides = k_for_attn->strides(); auto k_strides = k_for_attn->strides();
auto k_strided = k_for_attn->as_strided( auto k_strided = k_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim}, {total_seq_len, n_kv_head, ngroup, head_dim},
{k_strides[0], k_strides[1], 0, k_strides[2]} {k_strides[0], k_strides[1], 0, k_strides[2]});
);
k_for_attn = k_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim}); k_for_attn = k_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
auto v_strides = v_for_attn->strides(); auto v_strides = v_for_attn->strides();
auto v_strided = v_for_attn->as_strided( auto v_strided = v_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim}, {total_seq_len, n_kv_head, ngroup, head_dim},
{v_strides[0], v_strides[1], 0, v_strides[2]} {v_strides[0], v_strides[1], 0, v_strides[2]});
);
v_for_attn = v_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim}); v_for_attn = v_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
} }
...@@ -170,26 +164,25 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -170,26 +164,25 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
// Python: Q = querys.permute((1, 0, 2)) # [num_heads, seq_len, head_dim] // Python: Q = querys.permute((1, 0, 2)) # [num_heads, seq_len, head_dim]
// Python: K = keys # [total_seq_len, num_heads, head_dim] (NO permute!) // Python: K = keys # [total_seq_len, num_heads, head_dim] (NO permute!)
// Python: V = values.permute((1, 0, 2)) # [num_heads, total_seq_len, head_dim] // Python: V = values.permute((1, 0, 2)) # [num_heads, total_seq_len, head_dim]
auto Q = q_for_attn->permute({1, 0, 2}); // [n_q_head, seq_len, head_dim] auto Q = q_for_attn->permute({1, 0, 2}); // [n_q_head, seq_len, head_dim]
auto K = k_for_attn; // [total_seq_len, n_q_head, head_dim] - keep as-is (matching Python) auto K = k_for_attn; // [total_seq_len, n_q_head, head_dim] - keep as-is (matching Python)
auto V = v_for_attn->permute({1, 0, 2}); // [n_q_head, total_seq_len, head_dim] auto V = v_for_attn->permute({1, 0, 2}); // [n_q_head, total_seq_len, head_dim]
// Python: attn_weight = Q @ K.permute((1, 2, 0)) // Python: attn_weight = Q @ K.permute((1, 2, 0))
// Python: K.permute((1, 2, 0)) transforms [total_seq_len, num_heads, head_dim] -> [num_heads, head_dim, total_seq_len] // Python: K.permute((1, 2, 0)) transforms [total_seq_len, num_heads, head_dim] -> [num_heads, head_dim, total_seq_len]
auto K_transposed = K->permute({1, 2, 0}); // [n_q_head, head_dim, total_seq_len] auto K_transposed = K->permute({1, 2, 0}); // [n_q_head, head_dim, total_seq_len]
// Use GEMM with alpha=scaling to combine scaling with matrix multiplication // Use GEMM with alpha=scaling to combine scaling with matrix multiplication
// This is more efficient than doing matmul followed by mul // This is more efficient than doing matmul followed by mul
float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_)); float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_));
auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling); // [n_q_head, seq_len, total_seq_len] auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling); // [n_q_head, seq_len, total_seq_len]
infinicore::op::causal_softmax_(attn_weight, attn_weight); infinicore::op::causal_softmax_(attn_weight, attn_weight);
auto out = infinicore::op::matmul(attn_weight, V); // [n_q_head, seq_len, head_dim] auto out = infinicore::op::matmul(attn_weight, V); // [n_q_head, seq_len, head_dim]
// Python: return out.permute((1, 0, 2)).contiguous() # [seq_len, num_heads, head_dim] // Python: return out.permute((1, 0, 2)).contiguous() # [seq_len, num_heads, head_dim]
auto attn_output = out->permute({1, 0, 2})->contiguous(); // [seq_len, n_q_head, head_dim] auto attn_output = out->permute({1, 0, 2})->contiguous(); // [seq_len, n_q_head, head_dim]
// Python: attn_output_i.copy_(attention_i) // Python: attn_output_i.copy_(attention_i)
// Python: attn_output = attn_output.view(hidden_states_shape) # [bs, seq_len, hidden_size] // Python: attn_output = attn_output.view(hidden_states_shape) # [bs, seq_len, hidden_size]
......
#include <pybind11/pybind11.h>
#include "models/llama.hpp" #include "models/llama.hpp"
#include <pybind11/pybind11.h>
namespace py = pybind11; namespace py = pybind11;
PYBIND11_MODULE(_infinilm_llama, m) { PYBIND11_MODULE(_infinilm, m) {
m.doc() = "InfiniLM Llama model Python bindings"; m.doc() = "InfiniLM Llama model Python bindings";
infinilm::models::llama::bind_llama(m); infinilm::models::llama::bind_llama(m);
......
...@@ -14,6 +14,6 @@ if str(_lib_dir) not in sys.path: ...@@ -14,6 +14,6 @@ if str(_lib_dir) not in sys.path:
# Import the compiled C++ module # Import the compiled C++ module
# The .so file should be installed in this directory by xmake # The .so file should be installed in this directory by xmake
import _infinilm_llama import _infinilm
__all__ = ["_infinilm_llama"] __all__ = ["_infinilm"]
from ....generation.utils import GenerationMixin from ....generation.utils import GenerationMixin
import infinicore import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
from infinilm.lib import _infinilm_llama from infinilm.lib import _infinilm
import json import json
import os import os
from typing import Optional, Union from typing import Optional, Union
...@@ -49,7 +49,7 @@ class LlamaConfig: ...@@ -49,7 +49,7 @@ class LlamaConfig:
def _underlying(self): def _underlying(self):
"""Get underlying C++ config object, creating it if needed""" """Get underlying C++ config object, creating it if needed"""
if self._cpp_config is None: if self._cpp_config is None:
self._cpp_config = _infinilm_llama.LlamaConfig() self._cpp_config = _infinilm.LlamaConfig()
# Copy attributes from Python config to C++ config # Copy attributes from Python config to C++ config
for key in dir(self._python_config): for key in dir(self._python_config):
...@@ -107,7 +107,7 @@ class LlamaForCausalLM(GenerationMixin): ...@@ -107,7 +107,7 @@ class LlamaForCausalLM(GenerationMixin):
self.use_cache = False self.use_cache = False
self._device = device self._device = device
self._model = _infinilm_llama.LlamaForCausalLM( self._model = _infinilm.LlamaForCausalLM(
config._underlying, device._underlying, dtype config._underlying, device._underlying, dtype
) )
......
...@@ -9,8 +9,8 @@ from setuptools.command.egg_info import egg_info ...@@ -9,8 +9,8 @@ from setuptools.command.egg_info import egg_info
def build_cpp_module(): def build_cpp_module():
"""Build and install the C++ extension module""" """Build and install the C++ extension module"""
subprocess.run(["xmake", "build", "_infinilm_llama"], check=True) subprocess.run(["xmake", "build", "_infinilm"], check=True)
subprocess.run(["xmake", "install", "_infinilm_llama"], check=True) subprocess.run(["xmake", "install", "_infinilm"], check=True)
class Build(build): class Build(build):
......
...@@ -27,7 +27,7 @@ except ImportError as e: ...@@ -27,7 +27,7 @@ except ImportError as e:
try: try:
from infinilm.models.llama import LlamaConfig, LlamaForCausalLM, Device from infinilm.models.llama import LlamaConfig, LlamaForCausalLM, Device
import _infinilm_llama # Import C++ bindings for HookRegistry import _infinilm # Import C++ bindings for HookRegistry
except ImportError as e: except ImportError as e:
print(f"Error: InfiniLM Python package not found. Please install it: {e}") print(f"Error: InfiniLM Python package not found. Please install it: {e}")
sys.exit(1) sys.exit(1)
...@@ -756,7 +756,7 @@ def test_intermediate_validation( ...@@ -756,7 +756,7 @@ def test_intermediate_validation(
infini_position_ids = torch_to_infinicore_tensor(position_ids, infini_device) infini_position_ids = torch_to_infinicore_tensor(position_ids, infini_device)
# Create hook registry and register hooks # Create hook registry and register hooks
hook_registry = _infinilm_llama.HookRegistry() hook_registry = _infinilm.HookRegistry()
def make_infinilm_hook(name): def make_infinilm_hook(name):
def hook(hook_name, tensor, layer_idx): def hook(hook_name, tensor, layer_idx):
......
...@@ -36,7 +36,7 @@ except ImportError as e: ...@@ -36,7 +36,7 @@ except ImportError as e:
print(f"Error: InfiniLM Python package not found. Please install it:") print(f"Error: InfiniLM Python package not found. Please install it:")
print(f" pip install -e .") print(f" pip install -e .")
print(f" or") print(f" or")
print(f" xmake build _infinilm_llama && xmake install _infinilm_llama") print(f" xmake build _infinilm && xmake install _infinilm")
print(f" Error: {e}") print(f" Error: {e}")
sys.exit(1) sys.exit(1)
...@@ -487,9 +487,6 @@ def validate_inference( ...@@ -487,9 +487,6 @@ def validate_inference(
def main(): def main():
"""Main test function""" """Main test function"""
# Default model path
# default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/LLM-Research/Llama-3.2-1B-Instruct"
default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/AI-ModelScope/TinyLlama-1.1B-Chat-v1.0"
# Default prompt # Default prompt
default_prompt = "Hello, how are you?" default_prompt = "Hello, how are you?"
...@@ -545,8 +542,6 @@ def main(): ...@@ -545,8 +542,6 @@ def main():
sys.exit(1) sys.exit(1)
i += 1 i += 1
if model_dir is None:
model_dir = default_model_dir
if not os.path.exists(model_dir): if not os.path.exists(model_dir):
print(f"Error: Model directory not found: {model_dir}") print(f"Error: Model directory not found: {model_dir}")
...@@ -560,11 +555,11 @@ def main(): ...@@ -560,11 +555,11 @@ def main():
) )
print(f" Examples: cpu, cuda, cuda:0, cuda:1") print(f" Examples: cpu, cuda, cuda:0, cuda:1")
print(f"\nExamples:") print(f"\nExamples:")
print(f" {sys.argv[0]} {default_model_dir}") print(f" {sys.argv[0]} dir/to/model")
print(f' {sys.argv[0]} {default_model_dir} --prompt "What is AI?"') print(f' {sys.argv[0]} dir/to/model --prompt "What is AI?"')
print(f" {sys.argv[0]} {default_model_dir} --device cuda:0") print(f" {sys.argv[0]} dir/to/model --device cuda:0")
print( print(
f' {sys.argv[0]} {default_model_dir} --prompt "What is AI?" --device cuda:0' f' {sys.argv[0]} dir/to/model --prompt "What is AI?" --device cuda:0'
) )
sys.exit(1) sys.exit(1)
......
...@@ -32,8 +32,7 @@ target("infinicore_infer") ...@@ -32,8 +32,7 @@ target("infinicore_infer")
add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"}) add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"})
target_end() target_end()
-- Python bindings for Llama model target("_infinilm")
target("_infinilm_llama")
add_packages("pybind11") add_packages("pybind11")
set_default(false) set_default(false)
add_rules("python.module", {soabi = true}) add_rules("python.module", {soabi = true})
...@@ -52,8 +51,7 @@ target("_infinilm_llama") ...@@ -52,8 +51,7 @@ target("_infinilm_llama")
add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl") add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl")
-- Add Llama model files -- Add Llama model files
add_files("csrc/models/llama/llama_*.cpp") add_files("csrc/models/*/*.cpp")
add_files("csrc/models/debug_utils/*.cpp")
add_files("csrc/models/pybind11/models.cc") add_files("csrc/models/pybind11/models.cc")
set_installdir("python/infinilm") set_installdir("python/infinilm")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment