Unverified Commit 2d580e7a authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Fix flashinfer (#430)

parent 3fc97f67
...@@ -20,7 +20,7 @@ dependencies = [ ...@@ -20,7 +20,7 @@ dependencies = [
[project.optional-dependencies] [project.optional-dependencies]
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn", srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
"zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "flashinfer>=0.0.4", "packaging"] "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "packaging"]
openai = ["openai>=1.0", "numpy", "tiktoken"] openai = ["openai>=1.0", "numpy", "tiktoken"]
anthropic = ["anthropic>=0.20.0", "numpy"] anthropic = ["anthropic>=0.20.0", "numpy"]
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"] all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
......
...@@ -113,7 +113,8 @@ class ModelRpcServer: ...@@ -113,7 +113,8 @@ class ModelRpcServer:
f"max_prefill_num_token={self.max_prefill_num_token}, " f"max_prefill_num_token={self.max_prefill_num_token}, "
f"context_len={self.model_config.context_len}, " f"context_len={self.model_config.context_len}, "
) )
logger.info(f"server_args: {server_args.print_mode_args()}") if self.tp_rank == 0:
logger.info(f"server_args: {server_args.print_mode_args()}")
# Init cache # Init cache
self.tree_cache = RadixCache(disable=server_args.disable_radix_cache) self.tree_cache = RadixCache(disable=server_args.disable_radix_cache)
......
...@@ -110,12 +110,12 @@ class InputMetadata: ...@@ -110,12 +110,12 @@ class InputMetadata:
self.kv_last_page_len = torch.ones( self.kv_last_page_len = torch.ones(
(self.batch_size,), dtype=torch.int32, device="cuda" (self.batch_size,), dtype=torch.int32, device="cuda"
) )
req_pool_indices_cpu = self.req_pool_indices.cpu().numpy() req_pool_indices_cpu = self.req_pool_indices.cpu().tolist()
seq_lens_cpu = self.seq_lens.cpu().numpy() seq_lens_cpu = self.seq_lens.tolist()
self.kv_indices = torch.cat( self.kv_indices = torch.cat(
[ [
self.req_to_token_pool.req_to_token[ self.req_to_token_pool.req_to_token[
req_pool_indices_cpu[i]: seq_lens_cpu[i] req_pool_indices_cpu[i], : seq_lens_cpu[i]
] ]
for i in range(self.batch_size) for i in range(self.batch_size)
], ],
......
Markdown is supported
0% — Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment