"git@developer.sourcefind.cn:yaoyuping/nndetection.git" did not exist on "45543128928ed2f181de80882467d32bff86efa8"
Commit e48b5b0d authored by PanZezhong
Browse files

issue/168 remove input lengths

parent bf74389d
...@@ -72,11 +72,6 @@ infinilm::InfinilmModel::Input InferEngine::Input::to_model_input(infinicore::De ...@@ -72,11 +72,6 @@ infinilm::InfinilmModel::Input InferEngine::Input::to_model_input(infinicore::De
} }
} }
std::optional<infinicore::Tensor> input_lengths_on_device;
if (input_lengths.has_value()) {
input_lengths_on_device = input_lengths.value()->to(device);
}
std::optional<infinicore::Tensor> input_offsets_on_device; std::optional<infinicore::Tensor> input_offsets_on_device;
if (input_offsets.has_value()) { if (input_offsets.has_value()) {
input_offsets_on_device = input_offsets.value()->to(device); input_offsets_on_device = input_offsets.value()->to(device);
...@@ -96,7 +91,6 @@ infinilm::InfinilmModel::Input InferEngine::Input::to_model_input(infinicore::De ...@@ -96,7 +91,6 @@ infinilm::InfinilmModel::Input InferEngine::Input::to_model_input(infinicore::De
input_ids, // @todo: on device in the future input_ids, // @todo: on device in the future
position_ids_on_device, position_ids_on_device,
cache_lengths_on_device, cache_lengths_on_device,
input_lengths_on_device,
input_offsets_on_device, input_offsets_on_device,
block_tables_on_device, block_tables_on_device,
slot_mapping_on_device}; slot_mapping_on_device};
......
...@@ -188,7 +188,7 @@ void RankWorker::thread_loop() { ...@@ -188,7 +188,7 @@ void RankWorker::thread_loop() {
Command local_cmd = Command::INIT; Command local_cmd = Command::INIT;
std::string local_param_name; std::string local_param_name;
infinicore::Tensor local_param; infinicore::Tensor local_param;
InfinilmModel::Input local_args; Input local_args;
std::unique_ptr<cache::CacheConfig> local_cache_config; std::unique_ptr<cache::CacheConfig> local_cache_config;
// Wait for a job or exit // Wait for a job or exit
...@@ -206,7 +206,7 @@ void RankWorker::thread_loop() { ...@@ -206,7 +206,7 @@ void RankWorker::thread_loop() {
local_param_name = pending_param_name_; local_param_name = pending_param_name_;
local_param = pending_param_; local_param = pending_param_;
} else if (local_cmd == Command::RUN) { } else if (local_cmd == Command::RUN) {
local_args = pending_args_.to_model_input(rank_info_.device); local_args = pending_args_;
} else if (local_cmd == Command::RESET_CACHE) { } else if (local_cmd == Command::RESET_CACHE) {
if (pending_cache_config_ != nullptr) { if (pending_cache_config_ != nullptr) {
local_cache_config = pending_cache_config_->unique_copy(); local_cache_config = pending_cache_config_->unique_copy();
...@@ -244,28 +244,28 @@ void RankWorker::thread_loop() { ...@@ -244,28 +244,28 @@ void RankWorker::thread_loop() {
{ {
std::lock_guard<std::mutex> lk(mutex_); std::lock_guard<std::mutex> lk(mutex_);
auto logits{model_->forward(local_args).logits}; auto model_args = local_args.to_model_input(rank_info_.device);
// Forward calculation
auto logits{model_->forward(model_args).logits};
// Random sampling (rank 0 only)
if (rank_info_.tp_rank == 0) { if (rank_info_.tp_rank == 0) {
// Perform random sampling. auto temperature{local_args.temperature};
auto temperature{pending_args_.temperature}; auto top_p{local_args.top_p};
auto top_p{pending_args_.top_p}; auto top_k{local_args.top_k};
auto top_k{pending_args_.top_k}; auto random_val{local_args.random_val};
auto random_val{pending_args_.random_val};
const auto &logits_shape{logits->shape()}; const auto &logits_shape{logits->shape()};
const auto &vocab_size{logits_shape[2]}; const auto &vocab_size{logits_shape[2]};
const auto &total_len{logits_shape[1]}; const auto &total_len{logits_shape[1]};
const auto &batch_size{logits_shape[0]}; const auto &batch_size{logits_shape[0]};
auto n_req = pending_args_.input_offsets.value()->size(0); auto n_req = local_args.input_offsets.value()->size(0) - 1;
int64_t *input_lengths = (int64_t *)pending_args_.input_lengths.value()->data(); int64_t *input_offsets = (int64_t *)local_args.input_offsets.value()->data();
int64_t *input_offsets = (int64_t *)pending_args_.input_offsets.value()->data();
auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)}; auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};
for (auto i{decltype(n_req)(0)}; i < n_req; ++i) { for (auto i{decltype(n_req)(0)}; i < n_req; ++i) {
auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i] + input_lengths[i] - 1), 1}})->view({vocab_size})}; auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i + 1] - 1), 1}})->view({vocab_size})};
auto out{output_ids->narrow({{0, i, 1}})->view({})}; auto out{output_ids->narrow({{0, i, 1}})->view({})};
infinicore::op::random_sample_( infinicore::op::random_sample_(
out, score, random_val, top_p, top_k, temperature); out, score, random_val, top_p, top_k, temperature);
......
...@@ -30,8 +30,6 @@ public: ...@@ -30,8 +30,6 @@ public:
std::optional<infinicore::Tensor> position_ids; std::optional<infinicore::Tensor> position_ids;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`. /// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
std::optional<infinicore::Tensor> cache_lengths; std::optional<infinicore::Tensor> cache_lengths;
/// Input Lengths of each request in a continous-batched sequence, of shape `[num_requests]`.
std::optional<infinicore::Tensor> input_lengths;
/// Offsets of each request in a continuous-batched sequence, of shape `[num_requests]`. /// Offsets of each request in a continuous-batched sequence, of shape `[num_requests]`.
std::optional<infinicore::Tensor> input_offsets; std::optional<infinicore::Tensor> input_offsets;
/// Block ids for each request `[batch, max_block_table_length]`. Used for paged cache. /// Block ids for each request `[batch, max_block_table_length]`. Used for paged cache.
......
...@@ -24,9 +24,7 @@ public: ...@@ -24,9 +24,7 @@ public:
std::optional<infinicore::Tensor> position_ids; std::optional<infinicore::Tensor> position_ids;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`. /// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
std::optional<infinicore::Tensor> cache_lengths; std::optional<infinicore::Tensor> cache_lengths;
/// Input Lengths of each request in a continuous-batched sequence, of shape `[num_requests]`. /// Offsets of each request in a continuous-batched sequence, of shape `[num_requests + 1]`.
std::optional<infinicore::Tensor> input_lengths;
/// Offsets of each request in a continous-batched sequence, of shape `[num_requests]`.
std::optional<infinicore::Tensor> input_offsets; std::optional<infinicore::Tensor> input_offsets;
/// Block ids for each request `[batch, max_block_table_length]`. Used for paged cache. /// Block ids for each request `[batch, max_block_table_length]`. Used for paged cache.
std::optional<infinicore::Tensor> block_tables; std::optional<infinicore::Tensor> block_tables;
......
...@@ -142,12 +142,10 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd ...@@ -142,12 +142,10 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::shared_ptr<infinilm::cache::PagedKVCache> paged_kv_cache, std::shared_ptr<infinilm::cache::PagedKVCache> paged_kv_cache,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const { std::optional<infinicore::Tensor> slot_mapping) const {
ASSERT(block_tables.has_value()); ASSERT(block_tables.has_value());
ASSERT(input_lengths.has_value());
ASSERT(slot_mapping.has_value()); ASSERT(slot_mapping.has_value());
// Input shape: [batch, seq_len, hidden_size] // Input shape: [batch, seq_len, hidden_size]
...@@ -159,7 +157,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd ...@@ -159,7 +157,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
// Only support batchsize==1, all requests should be flattened along seqlen dimension // Only support batchsize==1, all requests should be flattened along seqlen dimension
ASSERT_EQ(batch_size, 1); ASSERT_EQ(batch_size, 1);
// Decode only if total_len == num_requests // Decode only if total_len == num_requests
bool is_prefill = (seq_len != input_lengths.value()->shape()[0]); bool is_prefill = (seq_len != cache_lengths.value()->shape()[0]);
// 1. Project Q, K, V // 1. Project Q, K, V
auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
...@@ -207,7 +205,6 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd ...@@ -207,7 +205,6 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
v_total, v_total,
block_tables.value(), block_tables.value(),
cache_lengths.value(), cache_lengths.value(),
input_lengths.value(),
input_offsets.value(), input_offsets.value(),
std::nullopt, std::nullopt,
scaling_); scaling_);
...@@ -233,7 +230,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -233,7 +230,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::shared_ptr<cache::Cache> kv_cache, std::shared_ptr<cache::Cache> kv_cache,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const { std::optional<infinicore::Tensor> slot_mapping) const {
...@@ -243,7 +239,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat ...@@ -243,7 +239,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
infinicore::Tensor output; infinicore::Tensor output;
if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) { if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
output = forward_paged_(hidden_states, position_ids, paged_kv_cache, cache_lengths, input_lengths, input_offsets, block_tables, slot_mapping); output = forward_paged_(hidden_states, position_ids, paged_kv_cache, cache_lengths, input_offsets, block_tables, slot_mapping);
} else { } else {
output = forward_(hidden_states, position_ids, kv_cache, cache_lengths); output = forward_(hidden_states, position_ids, kv_cache, cache_lengths);
......
...@@ -52,7 +52,6 @@ public: ...@@ -52,7 +52,6 @@ public:
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::shared_ptr<infinilm::cache::Cache> kv_cache, std::shared_ptr<infinilm::cache::Cache> kv_cache,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const; std::optional<infinicore::Tensor> slot_mapping) const;
...@@ -83,7 +82,6 @@ private: ...@@ -83,7 +82,6 @@ private:
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::shared_ptr<infinilm::cache::PagedKVCache> kv_cache, std::shared_ptr<infinilm::cache::PagedKVCache> kv_cache,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const; std::optional<infinicore::Tensor> slot_mapping) const;
......
...@@ -27,7 +27,6 @@ infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_s ...@@ -27,7 +27,6 @@ infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_s
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::shared_ptr<infinilm::cache::Cache> kv_cache, std::shared_ptr<infinilm::cache::Cache> kv_cache,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const { std::optional<infinicore::Tensor> slot_mapping) const {
...@@ -38,7 +37,7 @@ infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_s ...@@ -38,7 +37,7 @@ infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_s
auto normed_states = input_layernorm_->forward(hidden_states); auto normed_states = input_layernorm_->forward(hidden_states);
// 2. Self-attention with residual connection // 2. Self-attention with residual connection
auto attn_output = self_attn_->forward(normed_states, position_ids, kv_cache, cache_lengths, input_lengths, input_offsets, block_tables, slot_mapping); auto attn_output = self_attn_->forward(normed_states, position_ids, kv_cache, cache_lengths, input_offsets, block_tables, slot_mapping);
// Add residual: hidden_states = hidden_states + attn_output // Add residual: hidden_states = hidden_states + attn_output
auto output = infinicore::op::add(residual, attn_output); auto output = infinicore::op::add(residual, attn_output);
......
...@@ -50,7 +50,6 @@ public: ...@@ -50,7 +50,6 @@ public:
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::shared_ptr<infinilm::cache::Cache> kv_cache, std::shared_ptr<infinilm::cache::Cache> kv_cache,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mappin) const; std::optional<infinicore::Tensor> slot_mappin) const;
......
...@@ -29,13 +29,12 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { ...@@ -29,13 +29,12 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
auto input_ids = input.input_ids.value(); auto input_ids = input.input_ids.value();
auto position_ids = input.position_ids.value(); auto position_ids = input.position_ids.value();
auto cache_lengths = input.cache_lengths; auto cache_lengths = input.cache_lengths;
auto input_lengths = input.input_lengths;
auto input_offsets = input.input_offsets; auto input_offsets = input.input_offsets;
auto block_tables = input.block_tables; auto block_tables = input.block_tables;
auto slot_mapping = input.slot_mapping; auto slot_mapping = input.slot_mapping;
// 1. Forward through base model to get hidden states // 1. Forward through base model to get hidden states
auto hidden_states = model_->forward(input_ids, position_ids, cache_lengths, input_lengths, input_offsets, block_tables, slot_mapping); auto hidden_states = model_->forward(input_ids, position_ids, cache_lengths, input_offsets, block_tables, slot_mapping);
// 2. Apply language modeling head to get logits // 2. Apply language modeling head to get logits
auto logits = lm_head_->forward(hidden_states); auto logits = lm_head_->forward(hidden_states);
......
...@@ -46,7 +46,6 @@ LlamaModel::LlamaModel(const LlamaConfig &config, ...@@ -46,7 +46,6 @@ LlamaModel::LlamaModel(const LlamaConfig &config,
infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids, infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const { std::optional<infinicore::Tensor> slot_mapping) const {
...@@ -56,7 +55,7 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids, ...@@ -56,7 +55,7 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
// 2. Process through all decoder layers // 2. Process through all decoder layers
size_t num_layers = layers_.size(); size_t num_layers = layers_.size();
for (size_t i = 0; i < num_layers; ++i) { for (size_t i = 0; i < num_layers; ++i) {
hidden_states = layers_.at(i)->forward(hidden_states, position_ids, kv_cache_, cache_lengths, input_lengths, input_offsets, block_tables, slot_mapping); hidden_states = layers_.at(i)->forward(hidden_states, position_ids, kv_cache_, cache_lengths, input_offsets, block_tables, slot_mapping);
} }
return norm_->forward(hidden_states); return norm_->forward(hidden_states);
......
...@@ -49,14 +49,12 @@ public: ...@@ -49,14 +49,12 @@ public:
* and tokens from all requests are concatenated along seq_len dimension. * and tokens from all requests are concatenated along seq_len dimension.
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len] * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param cache_lengths Cache positions tensor of shape [n_req] * @param cache_lengths Cache positions tensor of shape [n_req]
* @param input_lengths Input lengths tensor in a continuous batch of shape [n_req] * @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req + 1]
* @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req]
* @return Output tensor of shape [batch, seq_len, hidden_size] * @return Output tensor of shape [batch, seq_len, hidden_size]
*/ */
infinicore::Tensor forward(const infinicore::Tensor &input_ids, infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids, const infinicore::Tensor &position_ids,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const; std::optional<infinicore::Tensor> slot_mapping) const;
......
...@@ -81,7 +81,6 @@ inline void bind_infer_engine(py::module &m) { ...@@ -81,7 +81,6 @@ inline void bind_infer_engine(py::module &m) {
std::optional<infinicore::Tensor> input_ids, std::optional<infinicore::Tensor> input_ids,
std::optional<infinicore::Tensor> position_ids, std::optional<infinicore::Tensor> position_ids,
std::optional<infinicore::Tensor> cache_lengths, std::optional<infinicore::Tensor> cache_lengths,
std::optional<infinicore::Tensor> input_lengths,
std::optional<infinicore::Tensor> input_offsets, std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> block_tables, std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping, std::optional<infinicore::Tensor> slot_mapping,
...@@ -90,7 +89,6 @@ inline void bind_infer_engine(py::module &m) { ...@@ -90,7 +89,6 @@ inline void bind_infer_engine(py::module &m) {
std::move(input_ids), std::move(input_ids),
std::move(position_ids), std::move(position_ids),
std::move(cache_lengths), std::move(cache_lengths),
std::move(input_lengths),
std::move(input_offsets), std::move(input_offsets),
std::move(block_tables), std::move(block_tables),
std::move(slot_mapping)}}; std::move(slot_mapping)}};
...@@ -112,14 +110,12 @@ inline void bind_infer_engine(py::module &m) { ...@@ -112,14 +110,12 @@ inline void bind_infer_engine(py::module &m) {
py::arg("input_ids") = std::nullopt, py::arg("input_ids") = std::nullopt,
py::arg("position_ids") = std::nullopt, py::arg("position_ids") = std::nullopt,
py::arg("cache_lengths") = std::nullopt, py::arg("cache_lengths") = std::nullopt,
py::arg("input_lengths") = std::nullopt,
py::arg("input_offsets") = std::nullopt, py::arg("input_offsets") = std::nullopt,
py::arg("block_tables") = std::nullopt, py::arg("block_tables") = std::nullopt,
py::arg("slot_mapping") = std::nullopt) py::arg("slot_mapping") = std::nullopt)
.def_readwrite("input_ids", &InferEngine::Input::input_ids) .def_readwrite("input_ids", &InferEngine::Input::input_ids)
.def_readwrite("position_ids", &InferEngine::Input::position_ids) .def_readwrite("position_ids", &InferEngine::Input::position_ids)
.def_readwrite("cache_lengths", &InferEngine::Input::cache_lengths) .def_readwrite("cache_lengths", &InferEngine::Input::cache_lengths)
.def_readwrite("input_lengths", &InferEngine::Input::input_lengths)
.def_readwrite("input_offsets", &InferEngine::Input::input_offsets) .def_readwrite("input_offsets", &InferEngine::Input::input_offsets)
.def_readwrite("block_tables", &InferEngine::Input::block_tables) .def_readwrite("block_tables", &InferEngine::Input::block_tables)
.def_readwrite("slot_mapping", &InferEngine::Input::slot_mapping); .def_readwrite("slot_mapping", &InferEngine::Input::slot_mapping);
......
...@@ -54,7 +54,6 @@ class InferEngine(_infinilm.InferEngine): ...@@ -54,7 +54,6 @@ class InferEngine(_infinilm.InferEngine):
*, *,
position_ids=None, position_ids=None,
cache_lengths=None, cache_lengths=None,
input_lengths=None,
input_offsets=None, input_offsets=None,
block_tables=None, block_tables=None,
slot_mapping=None, slot_mapping=None,
...@@ -66,7 +65,6 @@ class InferEngine(_infinilm.InferEngine): ...@@ -66,7 +65,6 @@ class InferEngine(_infinilm.InferEngine):
input_ids = input_ids._underlying if input_ids is not None else None input_ids = input_ids._underlying if input_ids is not None else None
position_ids = position_ids._underlying if position_ids is not None else None position_ids = position_ids._underlying if position_ids is not None else None
cache_lengths = cache_lengths._underlying if cache_lengths is not None else None cache_lengths = cache_lengths._underlying if cache_lengths is not None else None
input_lengths = input_lengths._underlying if input_lengths is not None else None
input_offsets = input_offsets._underlying if input_offsets is not None else None input_offsets = input_offsets._underlying if input_offsets is not None else None
block_tables = block_tables._underlying if block_tables is not None else None block_tables = block_tables._underlying if block_tables is not None else None
slot_mapping = slot_mapping._underlying if slot_mapping is not None else None slot_mapping = slot_mapping._underlying if slot_mapping is not None else None
...@@ -78,7 +76,6 @@ class InferEngine(_infinilm.InferEngine): ...@@ -78,7 +76,6 @@ class InferEngine(_infinilm.InferEngine):
input_ids, input_ids,
position_ids=position_ids, position_ids=position_ids,
cache_lengths=cache_lengths, cache_lengths=cache_lengths,
input_lengths=input_lengths,
input_offsets=input_offsets, input_offsets=input_offsets,
block_tables=block_tables, block_tables=block_tables,
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
...@@ -125,12 +122,9 @@ class InferEngine(_infinilm.InferEngine): ...@@ -125,12 +122,9 @@ class InferEngine(_infinilm.InferEngine):
cache_lengths = infinicore.from_list( cache_lengths = infinicore.from_list(
[past_seq_len] * batch_size, dtype=infinicore.int64 [past_seq_len] * batch_size, dtype=infinicore.int64
) )
input_lengths = infinicore.from_list(
[seq_len] * batch_size, dtype=infinicore.int64
)
input_offsets = infinicore.from_list( input_offsets = infinicore.from_list(
[seq_len * i for i in range(batch_size)], dtype=infinicore.int64 [seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int64
) )
block_tables = infinicore.from_list( block_tables = infinicore.from_list(
[ [
...@@ -160,15 +154,15 @@ class InferEngine(_infinilm.InferEngine): ...@@ -160,15 +154,15 @@ class InferEngine(_infinilm.InferEngine):
], ],
dtype=infinicore.int64, dtype=infinicore.int64,
) )
cache_lengths = infinicore.from_list( cache_lengths = infinicore.from_list(
[past_seq_len], dtype=infinicore.int64 [past_seq_len], dtype=infinicore.int64
) )
input_lengths = infinicore.from_list(
[seq_len] * batch_size, dtype=infinicore.int64
)
input_offsets = infinicore.from_list( input_offsets = infinicore.from_list(
[seq_len * i for i in range(batch_size)], dtype=infinicore.int64 [seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int64
) )
block_tables = None block_tables = None
slot_mapping = None slot_mapping = None
...@@ -176,7 +170,6 @@ class InferEngine(_infinilm.InferEngine): ...@@ -176,7 +170,6 @@ class InferEngine(_infinilm.InferEngine):
input_ids=input_ids, input_ids=input_ids,
position_ids=position_ids, position_ids=position_ids,
cache_lengths=cache_lengths, cache_lengths=cache_lengths,
input_lengths=input_lengths,
input_offsets=input_offsets, input_offsets=input_offsets,
block_tables=block_tables, block_tables=block_tables,
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
...@@ -188,7 +181,8 @@ class InferEngine(_infinilm.InferEngine): ...@@ -188,7 +181,8 @@ class InferEngine(_infinilm.InferEngine):
output_ids.append(output_id) output_ids.append(output_id)
if ( if (
generation_config.stop_on_eos initial_batch_size == 1
and generation_config.stop_on_eos
and generation_config.max_new_tokens is not None and generation_config.max_new_tokens is not None
and output_id.to_numpy()[0] in eos_token_id and output_id.to_numpy()[0] in eos_token_id
): ):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment