issue/257 fix(llm): sync inference service with FLA engine interface changes

30e93254 · MaYuhang · 70561bd9 · 30e93254 · 30e93254 · 30e93254
Commit 30e93254 authored Mar 09, 2026 by MaYuhang
Showing with 9 additions and 4 deletions

python/infinilm/llm/llm.py +4 -4

python/infinilm/llm/scheduler.py +4 -0

python/infinilm/llm/static_scheduler.py +1 -0

No files found.
--- a/python/infinilm/llm/llm.py
+++ b/python/infinilm/llm/llm.py
@@ -223,16 +223,16 @@ class LLMEngine:
            if value is None:
                # Skip None values (block_tables/slot_mapping for static cache)
                model_input[key] = None
+            elif key in ["input_ids", "position_ids", "slot_mapping"]:
+                model_input[key] = infinicore.from_list(value, dtype=infinicore.int64)
            elif key in [
-                "input_ids",
-                "position_ids",
                "past_kv_lengths",
                "total_kv_lengths",
                "input_offsets",
-                "slot_mapping",
+                "cu_seqlens",
                "block_tables",
            ]:
-                model_input[key] = infinicore.from_list(value, dtype=infinicore.int64)
+                model_input[key] = infinicore.from_list(value, dtype=infinicore.int32)
            else:
                # temperature, top_k, top_p, etc.
                model_input[key] = value

--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -101,6 +101,9 @@ class SchedulerOutput:
                max_block_table_len - len(req.block_table)
            )
            block_tables.append(padded_block_table)
+            cu_seqlens = [0]
+            for l in seq_lens:
+                cu_seqlens.append(cu_seqlens[-1] + l)

        return {
            "input_ids": [tokens],
@@ -108,6 +111,7 @@ class SchedulerOutput:
            "past_kv_lengths": cached_lens,
            "total_kv_lengths": seq_lens,
            "input_offsets": seq_offsets,
+            "cu_seqlens": cu_seqlens,
            "block_tables": block_tables,
            "slot_mapping": slot_mapping,
            "temperature": temperature,

--- a/python/infinilm/llm/static_scheduler.py
+++ b/python/infinilm/llm/static_scheduler.py
@@ -75,6 +75,7 @@ class StaticSchedulerOutput:
            "past_kv_lengths": [past_kv_len],
            "total_kv_lengths": [total_kv_len],
            "input_offsets": input_offsets,
+            "cu_seqlens": [0, total_kv_len],
            "block_tables": None,
            "slot_mapping": None,
            "temperature": temperature,