Commit 30e93254 authored by MaYuhang
Browse files

issue/257 fix(llm): sync inference service with FLA engine interface changes

parent 70561bd9
......@@ -223,16 +223,16 @@ class LLMEngine:
if value is None:
# Skip None values (block_tables/slot_mapping for static cache)
model_input[key] = None
elif key in ["input_ids", "position_ids", "slot_mapping"]:
model_input[key] = infinicore.from_list(value, dtype=infinicore.int64)
elif key in [
"input_ids",
"position_ids",
"past_kv_lengths",
"total_kv_lengths",
"input_offsets",
"slot_mapping",
"cu_seqlens",
"block_tables",
]:
model_input[key] = infinicore.from_list(value, dtype=infinicore.int64)
model_input[key] = infinicore.from_list(value, dtype=infinicore.int32)
else:
# temperature, top_k, top_p, etc.
model_input[key] = value
......
......@@ -101,6 +101,9 @@ class SchedulerOutput:
max_block_table_len - len(req.block_table)
)
block_tables.append(padded_block_table)
cu_seqlens = [0]
for l in seq_lens:
cu_seqlens.append(cu_seqlens[-1] + l)
return {
"input_ids": [tokens],
......@@ -108,6 +111,7 @@ class SchedulerOutput:
"past_kv_lengths": cached_lens,
"total_kv_lengths": seq_lens,
"input_offsets": seq_offsets,
"cu_seqlens": cu_seqlens,
"block_tables": block_tables,
"slot_mapping": slot_mapping,
"temperature": temperature,
......
......@@ -75,6 +75,7 @@ class StaticSchedulerOutput:
"past_kv_lengths": [past_kv_len],
"total_kv_lengths": [total_kv_len],
"input_offsets": input_offsets,
"cu_seqlens": [0, total_kv_len],
"block_tables": None,
"slot_mapping": None,
"temperature": temperature,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment