Unverified Commit 4367f4bb authored by Ying Sheng, committed by GitHub

Fix prefill size (#711)

parent 00e4baa7
@@ -103,6 +103,10 @@ class ModelTpServer:
             if server_args.max_running_requests is None
             else server_args.max_running_requests
         )
+        self.max_running_requests = min(
+            self.max_running_requests,
+            self.model_runner.req_to_token_pool.size - 1
+        )
         self.int_token_logit_bias = torch.tensor(
             get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
         )
...
@@ -11,6 +11,7 @@ class ReqToTokenPool:
     """A memory pool that maps a request to its token locations."""

     def __init__(self, size: int, max_context_len: int):
+        self.size = size
         self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
         self.req_to_token = torch.empty(
             (size, max_context_len), dtype=torch.int32, device="cuda"
...
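Taken together, the two hunks let the scheduler bound concurrency by the token pool's capacity: `ReqToTokenPool` now exposes its slot count as `self.size`, and `ModelTpServer` clamps `max_running_requests` to `size - 1`. Below is a minimal, self-contained sketch of that interaction; the standalone `cap_max_running_requests` helper and the CPU device are assumptions made for illustration, not the repository's actual code.

```python
import torch


class ReqToTokenPool:
    """A memory pool that maps a request to its token locations."""

    def __init__(self, size: int, max_context_len: int):
        # Exposing the capacity lets callers cap concurrency against it,
        # mirroring the `self.size = size` line added in the diff.
        self.size = size
        # True means the slot is free. The real pool allocates on "cuda";
        # "cpu" is used here only so the sketch runs anywhere.
        self.mem_state = torch.ones((size,), dtype=torch.bool, device="cpu")
        self.req_to_token = torch.empty(
            (size, max_context_len), dtype=torch.int32, device="cpu"
        )


def cap_max_running_requests(max_running_requests: int, pool: ReqToTokenPool) -> int:
    # Clamp to pool.size - 1 as the diff does, so the scheduler never
    # admits more concurrent requests than the pool can hold (with one
    # slot of headroom). Without the cap, a server configured with more
    # running requests than pool slots could fail to allocate a slot.
    return min(max_running_requests, pool.size - 1)


pool = ReqToTokenPool(size=16, max_context_len=2048)
print(cap_max_running_requests(64, pool))  # 15: bounded by the pool
print(cap_max_running_requests(8, pool))   # 8: already within bounds
```

As the two calls show, an oversized configured limit is silently reduced to fit the pool, while a limit already within bounds passes through unchanged.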