Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
e48b5b0d
Commit
e48b5b0d
authored
Jan 06, 2026
by
PanZezhong
Browse files
issue/168 remove input lengths
parent
bf74389d
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
27 additions
and
59 deletions
+27
-59
csrc/engine/infer_engine.cpp
csrc/engine/infer_engine.cpp
+0
-6
csrc/engine/rank_worker.cpp
csrc/engine/rank_worker.cpp
+13
-13
csrc/engine/rank_worker.hpp
csrc/engine/rank_worker.hpp
+0
-2
csrc/models/infinilm_model.hpp
csrc/models/infinilm_model.hpp
+1
-3
csrc/models/llama/llama_attention.cpp
csrc/models/llama/llama_attention.cpp
+2
-6
csrc/models/llama/llama_attention.hpp
csrc/models/llama/llama_attention.hpp
+0
-2
csrc/models/llama/llama_decoder_layer.cpp
csrc/models/llama/llama_decoder_layer.cpp
+1
-2
csrc/models/llama/llama_decoder_layer.hpp
csrc/models/llama/llama_decoder_layer.hpp
+0
-1
csrc/models/llama/llama_for_causal_lm.cpp
csrc/models/llama/llama_for_causal_lm.cpp
+1
-2
csrc/models/llama/llama_model.cpp
csrc/models/llama/llama_model.cpp
+1
-2
csrc/models/llama/llama_model.hpp
csrc/models/llama/llama_model.hpp
+1
-3
csrc/pybind11/engine/engine.hpp
csrc/pybind11/engine/engine.hpp
+0
-4
python/infinilm/infer_engine.py
python/infinilm/infer_engine.py
+7
-13
No files found.
csrc/engine/infer_engine.cpp
View file @
e48b5b0d
...
...
@@ -72,11 +72,6 @@ infinilm::InfinilmModel::Input InferEngine::Input::to_model_input(infinicore::De
}
}
std
::
optional
<
infinicore
::
Tensor
>
input_lengths_on_device
;
if
(
input_lengths
.
has_value
())
{
input_lengths_on_device
=
input_lengths
.
value
()
->
to
(
device
);
}
std
::
optional
<
infinicore
::
Tensor
>
input_offsets_on_device
;
if
(
input_offsets
.
has_value
())
{
input_offsets_on_device
=
input_offsets
.
value
()
->
to
(
device
);
...
...
@@ -96,7 +91,6 @@ infinilm::InfinilmModel::Input InferEngine::Input::to_model_input(infinicore::De
input_ids
,
// @todo: on device in the future
position_ids_on_device
,
cache_lengths_on_device
,
input_lengths_on_device
,
input_offsets_on_device
,
block_tables_on_device
,
slot_mapping_on_device
};
...
...
csrc/engine/rank_worker.cpp
View file @
e48b5b0d
...
...
@@ -188,7 +188,7 @@ void RankWorker::thread_loop() {
Command
local_cmd
=
Command
::
INIT
;
std
::
string
local_param_name
;
infinicore
::
Tensor
local_param
;
InfinilmModel
::
Input
local_args
;
Input
local_args
;
std
::
unique_ptr
<
cache
::
CacheConfig
>
local_cache_config
;
// Wait for a job or exit
...
...
@@ -206,7 +206,7 @@ void RankWorker::thread_loop() {
local_param_name
=
pending_param_name_
;
local_param
=
pending_param_
;
}
else
if
(
local_cmd
==
Command
::
RUN
)
{
local_args
=
pending_args_
.
to_model_input
(
rank_info_
.
device
)
;
local_args
=
pending_args_
;
}
else
if
(
local_cmd
==
Command
::
RESET_CACHE
)
{
if
(
pending_cache_config_
!=
nullptr
)
{
local_cache_config
=
pending_cache_config_
->
unique_copy
();
...
...
@@ -244,28 +244,28 @@ void RankWorker::thread_loop() {
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
mutex_
);
auto
logits
{
model_
->
forward
(
local_args
).
logits
};
auto
model_args
=
local_args
.
to_model_input
(
rank_info_
.
device
);
// Forward calculation
auto
logits
{
model_
->
forward
(
model_args
).
logits
};
// Random sampling (rank 0 only)
if
(
rank_info_
.
tp_rank
==
0
)
{
// Perform random sampling.
auto
temperature
{
pending_args_
.
temperature
};
auto
top_p
{
pending_args_
.
top_p
};
auto
top_k
{
pending_args_
.
top_k
};
auto
random_val
{
pending_args_
.
random_val
};
auto
temperature
{
local_args
.
temperature
};
auto
top_p
{
local_args
.
top_p
};
auto
top_k
{
local_args
.
top_k
};
auto
random_val
{
local_args
.
random_val
};
const
auto
&
logits_shape
{
logits
->
shape
()};
const
auto
&
vocab_size
{
logits_shape
[
2
]};
const
auto
&
total_len
{
logits_shape
[
1
]};
const
auto
&
batch_size
{
logits_shape
[
0
]};
auto
n_req
=
pending_args_
.
input_offsets
.
value
()
->
size
(
0
);
int64_t
*
input_lengths
=
(
int64_t
*
)
pending_args_
.
input_lengths
.
value
()
->
data
();
int64_t
*
input_offsets
=
(
int64_t
*
)
pending_args_
.
input_offsets
.
value
()
->
data
();
auto
n_req
=
local_args
.
input_offsets
.
value
()
->
size
(
0
)
-
1
;
int64_t
*
input_offsets
=
(
int64_t
*
)
local_args
.
input_offsets
.
value
()
->
data
();
auto
output_ids
{
infinicore
::
Tensor
::
empty
({
n_req
},
infinicore
::
DataType
::
I64
,
rank_info_
.
device
)};
for
(
auto
i
{
decltype
(
n_req
)(
0
)};
i
<
n_req
;
++
i
)
{
auto
score
{
logits
->
view
({
batch_size
*
total_len
,
vocab_size
})
->
narrow
({{
0
,
size_t
(
input_offsets
[
i
]
+
input_lengths
[
i
]
-
1
),
1
}})
->
view
({
vocab_size
})};
auto
score
{
logits
->
view
({
batch_size
*
total_len
,
vocab_size
})
->
narrow
({{
0
,
size_t
(
input_offsets
[
i
+
1
]
-
1
),
1
}})
->
view
({
vocab_size
})};
auto
out
{
output_ids
->
narrow
({{
0
,
i
,
1
}})
->
view
({})};
infinicore
::
op
::
random_sample_
(
out
,
score
,
random_val
,
top_p
,
top_k
,
temperature
);
...
...
csrc/engine/rank_worker.hpp
View file @
e48b5b0d
...
...
@@ -30,8 +30,6 @@ public:
std
::
optional
<
infinicore
::
Tensor
>
position_ids
;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
;
/// Input Lengths of each request in a continuous-batched sequence, of shape `[num_requests]`.
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
;
/// Offsets of each request in a continuous-batched sequence, of shape `[num_requests + 1]`.
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
;
/// Block ids for each request `[batch, max_block_table_length]`. Used for paged cache.
...
...
csrc/models/infinilm_model.hpp
View file @
e48b5b0d
...
...
@@ -24,9 +24,7 @@ public:
std
::
optional
<
infinicore
::
Tensor
>
position_ids
;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
;
/// Input Lengths of each request in a continuous-batched sequence, of shape `[num_requests]`.
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
;
/// Offsets of each request in a continuous-batched sequence, of shape `[num_requests]`.
/// Offsets of each request in a continuous-batched sequence, of shape `[num_requests + 1]`.
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
;
/// Block ids for each request `[batch, max_block_table_length]`. Used for paged cache.
std
::
optional
<
infinicore
::
Tensor
>
block_tables
;
...
...
csrc/models/llama/llama_attention.cpp
View file @
e48b5b0d
...
...
@@ -142,12 +142,10 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
const
infinicore
::
Tensor
&
position_ids
,
std
::
shared_ptr
<
infinilm
::
cache
::
PagedKVCache
>
paged_kv_cache
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
{
ASSERT
(
block_tables
.
has_value
());
ASSERT
(
input_lengths
.
has_value
());
ASSERT
(
slot_mapping
.
has_value
());
// Input shape: [batch, seq_len, hidden_size]
...
...
@@ -159,7 +157,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
// Only support batchsize==1, all requests should be flattened along seqlen dimension
ASSERT_EQ
(
batch_size
,
1
);
// Decode only if total_len == num_requests
bool
is_prefill
=
(
seq_len
!=
input
_lengths
.
value
()
->
shape
()[
0
]);
bool
is_prefill
=
(
seq_len
!=
cache
_lengths
.
value
()
->
shape
()[
0
]);
// 1. Project Q, K, V
auto
[
q
,
k
,
v
]
=
qkv_proj_
->
forward_split
(
hidden_states_mutable
);
...
...
@@ -207,7 +205,6 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
v_total
,
block_tables
.
value
(),
cache_lengths
.
value
(),
input_lengths
.
value
(),
input_offsets
.
value
(),
std
::
nullopt
,
scaling_
);
...
...
@@ -233,7 +230,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
const
infinicore
::
Tensor
&
position_ids
,
std
::
shared_ptr
<
cache
::
Cache
>
kv_cache
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
{
...
...
@@ -243,7 +239,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
infinicore
::
Tensor
output
;
if
(
auto
paged_kv_cache
=
std
::
dynamic_pointer_cast
<
cache
::
PagedKVCache
>
(
kv_cache
))
{
output
=
forward_paged_
(
hidden_states
,
position_ids
,
paged_kv_cache
,
cache_lengths
,
input_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
output
=
forward_paged_
(
hidden_states
,
position_ids
,
paged_kv_cache
,
cache_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
}
else
{
output
=
forward_
(
hidden_states
,
position_ids
,
kv_cache
,
cache_lengths
);
...
...
csrc/models/llama/llama_attention.hpp
View file @
e48b5b0d
...
...
@@ -52,7 +52,6 @@ public:
const
infinicore
::
Tensor
&
position_ids
,
std
::
shared_ptr
<
infinilm
::
cache
::
Cache
>
kv_cache
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
;
...
...
@@ -83,7 +82,6 @@ private:
const
infinicore
::
Tensor
&
position_ids
,
std
::
shared_ptr
<
infinilm
::
cache
::
PagedKVCache
>
kv_cache
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
;
...
...
csrc/models/llama/llama_decoder_layer.cpp
View file @
e48b5b0d
...
...
@@ -27,7 +27,6 @@ infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_s
const
infinicore
::
Tensor
&
position_ids
,
std
::
shared_ptr
<
infinilm
::
cache
::
Cache
>
kv_cache
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
{
...
...
@@ -38,7 +37,7 @@ infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_s
auto
normed_states
=
input_layernorm_
->
forward
(
hidden_states
);
// 2. Self-attention with residual connection
auto
attn_output
=
self_attn_
->
forward
(
normed_states
,
position_ids
,
kv_cache
,
cache_lengths
,
input_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
auto
attn_output
=
self_attn_
->
forward
(
normed_states
,
position_ids
,
kv_cache
,
cache_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
// Add residual: hidden_states = hidden_states + attn_output
auto
output
=
infinicore
::
op
::
add
(
residual
,
attn_output
);
...
...
csrc/models/llama/llama_decoder_layer.hpp
View file @
e48b5b0d
...
...
@@ -50,7 +50,6 @@ public:
const
infinicore
::
Tensor
&
position_ids
,
std
::
shared_ptr
<
infinilm
::
cache
::
Cache
>
kv_cache
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mappin
)
const
;
...
...
csrc/models/llama/llama_for_causal_lm.cpp
View file @
e48b5b0d
...
...
@@ -29,13 +29,12 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
auto
input_ids
=
input
.
input_ids
.
value
();
auto
position_ids
=
input
.
position_ids
.
value
();
auto
cache_lengths
=
input
.
cache_lengths
;
auto
input_lengths
=
input
.
input_lengths
;
auto
input_offsets
=
input
.
input_offsets
;
auto
block_tables
=
input
.
block_tables
;
auto
slot_mapping
=
input
.
slot_mapping
;
// 1. Forward through base model to get hidden states
auto
hidden_states
=
model_
->
forward
(
input_ids
,
position_ids
,
cache_lengths
,
input_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
auto
hidden_states
=
model_
->
forward
(
input_ids
,
position_ids
,
cache_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
// 2. Apply language modeling head to get logits
auto
logits
=
lm_head_
->
forward
(
hidden_states
);
...
...
csrc/models/llama/llama_model.cpp
View file @
e48b5b0d
...
...
@@ -46,7 +46,6 @@ LlamaModel::LlamaModel(const LlamaConfig &config,
infinicore
::
Tensor
LlamaModel
::
forward
(
const
infinicore
::
Tensor
&
input_ids
,
const
infinicore
::
Tensor
&
position_ids
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
{
...
...
@@ -56,7 +55,7 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
// 2. Process through all decoder layers
size_t
num_layers
=
layers_
.
size
();
for
(
size_t
i
=
0
;
i
<
num_layers
;
++
i
)
{
hidden_states
=
layers_
.
at
(
i
)
->
forward
(
hidden_states
,
position_ids
,
kv_cache_
,
cache_lengths
,
input_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
hidden_states
=
layers_
.
at
(
i
)
->
forward
(
hidden_states
,
position_ids
,
kv_cache_
,
cache_lengths
,
input_offsets
,
block_tables
,
slot_mapping
);
}
return
norm_
->
forward
(
hidden_states
);
...
...
csrc/models/llama/llama_model.hpp
View file @
e48b5b0d
...
...
@@ -49,14 +49,12 @@ public:
* and tokens from all requests are concatenated along seq_len dimension.
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param cache_lengths Cache positions tensor of shape [n_req]
* @param input_lengths Input lengths tensor in a continuous batch of shape [n_req]
* @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req]
* @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req + 1]
* @return Output tensor of shape [batch, seq_len, hidden_size]
*/
infinicore
::
Tensor
forward
(
const
infinicore
::
Tensor
&
input_ids
,
const
infinicore
::
Tensor
&
position_ids
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
)
const
;
...
...
csrc/pybind11/engine/engine.hpp
View file @
e48b5b0d
...
...
@@ -81,7 +81,6 @@ inline void bind_infer_engine(py::module &m) {
std
::
optional
<
infinicore
::
Tensor
>
input_ids
,
std
::
optional
<
infinicore
::
Tensor
>
position_ids
,
std
::
optional
<
infinicore
::
Tensor
>
cache_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_lengths
,
std
::
optional
<
infinicore
::
Tensor
>
input_offsets
,
std
::
optional
<
infinicore
::
Tensor
>
block_tables
,
std
::
optional
<
infinicore
::
Tensor
>
slot_mapping
,
...
...
@@ -90,7 +89,6 @@ inline void bind_infer_engine(py::module &m) {
std
::
move
(
input_ids
),
std
::
move
(
position_ids
),
std
::
move
(
cache_lengths
),
std
::
move
(
input_lengths
),
std
::
move
(
input_offsets
),
std
::
move
(
block_tables
),
std
::
move
(
slot_mapping
)}};
...
...
@@ -112,14 +110,12 @@ inline void bind_infer_engine(py::module &m) {
py
::
arg
(
"input_ids"
)
=
std
::
nullopt
,
py
::
arg
(
"position_ids"
)
=
std
::
nullopt
,
py
::
arg
(
"cache_lengths"
)
=
std
::
nullopt
,
py
::
arg
(
"input_lengths"
)
=
std
::
nullopt
,
py
::
arg
(
"input_offsets"
)
=
std
::
nullopt
,
py
::
arg
(
"block_tables"
)
=
std
::
nullopt
,
py
::
arg
(
"slot_mapping"
)
=
std
::
nullopt
)
.
def_readwrite
(
"input_ids"
,
&
InferEngine
::
Input
::
input_ids
)
.
def_readwrite
(
"position_ids"
,
&
InferEngine
::
Input
::
position_ids
)
.
def_readwrite
(
"cache_lengths"
,
&
InferEngine
::
Input
::
cache_lengths
)
.
def_readwrite
(
"input_lengths"
,
&
InferEngine
::
Input
::
input_lengths
)
.
def_readwrite
(
"input_offsets"
,
&
InferEngine
::
Input
::
input_offsets
)
.
def_readwrite
(
"block_tables"
,
&
InferEngine
::
Input
::
block_tables
)
.
def_readwrite
(
"slot_mapping"
,
&
InferEngine
::
Input
::
slot_mapping
);
...
...
python/infinilm/infer_engine.py
View file @
e48b5b0d
...
...
@@ -54,7 +54,6 @@ class InferEngine(_infinilm.InferEngine):
*
,
position_ids
=
None
,
cache_lengths
=
None
,
input_lengths
=
None
,
input_offsets
=
None
,
block_tables
=
None
,
slot_mapping
=
None
,
...
...
@@ -66,7 +65,6 @@ class InferEngine(_infinilm.InferEngine):
input_ids
=
input_ids
.
_underlying
if
input_ids
is
not
None
else
None
position_ids
=
position_ids
.
_underlying
if
position_ids
is
not
None
else
None
cache_lengths
=
cache_lengths
.
_underlying
if
cache_lengths
is
not
None
else
None
input_lengths
=
input_lengths
.
_underlying
if
input_lengths
is
not
None
else
None
input_offsets
=
input_offsets
.
_underlying
if
input_offsets
is
not
None
else
None
block_tables
=
block_tables
.
_underlying
if
block_tables
is
not
None
else
None
slot_mapping
=
slot_mapping
.
_underlying
if
slot_mapping
is
not
None
else
None
...
...
@@ -78,7 +76,6 @@ class InferEngine(_infinilm.InferEngine):
input_ids
,
position_ids
=
position_ids
,
cache_lengths
=
cache_lengths
,
input_lengths
=
input_lengths
,
input_offsets
=
input_offsets
,
block_tables
=
block_tables
,
slot_mapping
=
slot_mapping
,
...
...
@@ -125,12 +122,9 @@ class InferEngine(_infinilm.InferEngine):
cache_lengths
=
infinicore
.
from_list
(
[
past_seq_len
]
*
batch_size
,
dtype
=
infinicore
.
int64
)
input_lengths
=
infinicore
.
from_list
(
[
seq_len
]
*
batch_size
,
dtype
=
infinicore
.
int64
)
input_offsets
=
infinicore
.
from_list
(
[
seq_len
*
i
for
i
in
range
(
batch_size
)],
dtype
=
infinicore
.
int64
[
seq_len
*
i
for
i
in
range
(
batch_size
+
1
)],
dtype
=
infinicore
.
int64
)
block_tables
=
infinicore
.
from_list
(
[
...
...
@@ -160,15 +154,15 @@ class InferEngine(_infinilm.InferEngine):
],
dtype
=
infinicore
.
int64
,
)
cache_lengths
=
infinicore
.
from_list
(
[
past_seq_len
],
dtype
=
infinicore
.
int64
)
input_lengths
=
infinicore
.
from_list
(
[
seq_len
]
*
batch_size
,
dtype
=
infinicore
.
int64
)
input_offsets
=
infinicore
.
from_list
(
[
seq_len
*
i
for
i
in
range
(
batch_size
)],
dtype
=
infinicore
.
int64
[
seq_len
*
i
for
i
in
range
(
batch_size
+
1
)],
dtype
=
infinicore
.
int64
)
block_tables
=
None
slot_mapping
=
None
...
...
@@ -176,7 +170,6 @@ class InferEngine(_infinilm.InferEngine):
input_ids
=
input_ids
,
position_ids
=
position_ids
,
cache_lengths
=
cache_lengths
,
input_lengths
=
input_lengths
,
input_offsets
=
input_offsets
,
block_tables
=
block_tables
,
slot_mapping
=
slot_mapping
,
...
...
@@ -188,7 +181,8 @@ class InferEngine(_infinilm.InferEngine):
output_ids
.
append
(
output_id
)
if
(
generation_config
.
stop_on_eos
initial_batch_size
==
1
and
generation_config
.
stop_on_eos
and
generation_config
.
max_new_tokens
is
not
None
and
output_id
.
to_numpy
()[
0
]
in
eos_token_id
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment