Unverified commit dfec9d89, authored by thatPepe and committed by GitHub
Browse files

Merge pull request #258 from InfiniTensor/issue/257

issue/257 fix(llm): sync inference service with FLA engine interface …
parents 70561bd9 30e93254
...@@ -223,16 +223,16 @@ class LLMEngine: ...@@ -223,16 +223,16 @@ class LLMEngine:
if value is None: if value is None:
# Skip None values (block_tables/slot_mapping for static cache) # Skip None values (block_tables/slot_mapping for static cache)
model_input[key] = None model_input[key] = None
elif key in ["input_ids", "position_ids", "slot_mapping"]:
model_input[key] = infinicore.from_list(value, dtype=infinicore.int64)
elif key in [ elif key in [
"input_ids",
"position_ids",
"past_kv_lengths", "past_kv_lengths",
"total_kv_lengths", "total_kv_lengths",
"input_offsets", "input_offsets",
"slot_mapping", "cu_seqlens",
"block_tables", "block_tables",
]: ]:
model_input[key] = infinicore.from_list(value, dtype=infinicore.int64) model_input[key] = infinicore.from_list(value, dtype=infinicore.int32)
else: else:
# temperature, top_k, top_p, etc. # temperature, top_k, top_p, etc.
model_input[key] = value model_input[key] = value
......
...@@ -101,6 +101,9 @@ class SchedulerOutput: ...@@ -101,6 +101,9 @@ class SchedulerOutput:
max_block_table_len - len(req.block_table) max_block_table_len - len(req.block_table)
) )
block_tables.append(padded_block_table) block_tables.append(padded_block_table)
cu_seqlens = [0]
for l in seq_lens:
cu_seqlens.append(cu_seqlens[-1] + l)
return { return {
"input_ids": [tokens], "input_ids": [tokens],
...@@ -108,6 +111,7 @@ class SchedulerOutput: ...@@ -108,6 +111,7 @@ class SchedulerOutput:
"past_kv_lengths": cached_lens, "past_kv_lengths": cached_lens,
"total_kv_lengths": seq_lens, "total_kv_lengths": seq_lens,
"input_offsets": seq_offsets, "input_offsets": seq_offsets,
"cu_seqlens": cu_seqlens,
"block_tables": block_tables, "block_tables": block_tables,
"slot_mapping": slot_mapping, "slot_mapping": slot_mapping,
"temperature": temperature, "temperature": temperature,
......
...@@ -75,6 +75,7 @@ class StaticSchedulerOutput: ...@@ -75,6 +75,7 @@ class StaticSchedulerOutput:
"past_kv_lengths": [past_kv_len], "past_kv_lengths": [past_kv_len],
"total_kv_lengths": [total_kv_len], "total_kv_lengths": [total_kv_len],
"input_offsets": input_offsets, "input_offsets": input_offsets,
"cu_seqlens": [0, total_kv_len],
"block_tables": None, "block_tables": None,
"slot_mapping": None, "slot_mapping": None,
"temperature": temperature, "temperature": temperature,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment