Unverified commit 9216b106, authored by Liangsheng Yin, committed by GitHub
Browse files

Improve performance when running with full parallel (#394)

parent da19434c
@@ -348,6 +348,7 @@ class ModelRpcServer:
 # Undo the insertion
 delta = self.tree_cache.dec_ref_counter(req.last_node)
 available_size += delta
+break
 else:
 # Add this request to the running batch
 self.token_to_kv_pool.add_refs(req.prefix_indices)
@@ -356,7 +357,8 @@ class ModelRpcServer:
 req.extend_input_len + req.max_new_tokens()
 )
 new_batch_input_tokens += req.extend_input_len
+else:
+break
 if len(can_run_list) == 0:
 return None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment