Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
2d580e7a
Unverified
Commit
2d580e7a
authored
May 12, 2024
by
Lianmin Zheng
Committed by
GitHub
May 12, 2024
Browse files
Fix flashinfer (#430)
parent
3fc97f67
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
6 additions
and
5 deletions
+6
-5
python/pyproject.toml
python/pyproject.toml
+1
-1
python/sglang/srt/managers/router/model_rpc.py
python/sglang/srt/managers/router/model_rpc.py
+2
-1
python/sglang/srt/managers/router/model_runner.py
python/sglang/srt/managers/router/model_runner.py
+3
-3
No files found.
python/pyproject.toml
View file @
2d580e7a
...
...
@@ -20,7 +20,7 @@ dependencies = [
[project.optional-dependencies]
srt
=
[
"aiohttp"
,
"fastapi"
,
"psutil"
,
"rpyc"
,
"torch"
,
"uvloop"
,
"uvicorn"
,
"zmq"
,
"vllm>=0.4.2"
,
"interegular"
,
"pydantic"
,
"pillow"
,
"outlines>=0.0.27"
,
"flashinfer>=0.0.4"
,
"packaging"
]
"zmq"
,
"vllm>=0.4.2"
,
"interegular"
,
"pydantic"
,
"pillow"
,
"outlines>=0.0.27"
,
"packaging"
]
openai
=
[
"openai>=1.0"
,
"numpy"
,
"tiktoken"
]
anthropic
=
[
"anthropic>=0.20.0"
,
"numpy"
]
all
=
["sglang[srt]
", "
sglang
[openai]
", "
sglang
[anthropic]"]
...
...
python/sglang/srt/managers/router/model_rpc.py
View file @
2d580e7a
...
...
@@ -113,7 +113,8 @@ class ModelRpcServer:
f
"max_prefill_num_token=
{
self
.
max_prefill_num_token
}
, "
f
"context_len=
{
self
.
model_config
.
context_len
}
, "
)
logger
.
info
(
f
"server_args:
{
server_args
.
print_mode_args
()
}
"
)
if
self
.
tp_rank
==
0
:
logger
.
info
(
f
"server_args:
{
server_args
.
print_mode_args
()
}
"
)
# Init cache
self
.
tree_cache
=
RadixCache
(
disable
=
server_args
.
disable_radix_cache
)
...
...
python/sglang/srt/managers/router/model_runner.py
View file @
2d580e7a
...
...
@@ -110,12 +110,12 @@ class InputMetadata:
self
.
kv_last_page_len
=
torch
.
ones
(
(
self
.
batch_size
,),
dtype
=
torch
.
int32
,
device
=
"cuda"
)
req_pool_indices_cpu
=
self
.
req_pool_indices
.
cpu
().
numpy
()
seq_lens_cpu
=
self
.
seq_lens
.
cpu
().
numpy
()
req_pool_indices_cpu
=
self
.
req_pool_indices
.
cpu
().
tolist
()
seq_lens_cpu
=
self
.
seq_lens
.
tolist
()
self
.
kv_indices
=
torch
.
cat
(
[
self
.
req_to_token_pool
.
req_to_token
[
req_pool_indices_cpu
[
i
]:
seq_lens_cpu
[
i
]
req_pool_indices_cpu
[
i
]
,
:
seq_lens_cpu
[
i
]
]
for
i
in
range
(
self
.
batch_size
)
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment