OpenDAS / ktransformers · Commits

Commit f029588b
Authored Feb 18, 2025 by Xie Weiyu
Parent: 9f1da186

fix server warmup

Showing 2 changed files with 17 additions and 14 deletions (+17 / -14):
  ktransformers/server/backend/interfaces/ktransformers.py   +15 / -12
  ktransformers/server/backend/interfaces/transformers.py    +2 / -2
ktransformers/server/backend/interfaces/ktransformers.py

@@ -73,13 +73,13 @@ class KTransformersInterface(TransformersInterface):
         self._infer_lock = asyncio.Lock()
 
-    def decode_one_tokens(self, i):
+    def decode_one_tokens(self):
         device_map = self.model.gguf_loader.tensor_device_map
         torch_device = get_device("blk.0.self_attn", device_map)
         torch_device = "cuda:0" if torch_device == "cuda" else torch_device
         global warm_uped
-        if self.args.use_cuda_graph and ((warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2)):
-            warm_uped = True
+        if self.args.use_cuda_graph and warm_uped == True:
             if not hasattr(self, "cuda_graph_runner"):
                 self.cuda_graph_runner = CUDAGraphRunner()
                 self.cuda_graph_runner.capture(
...
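The hunk above replaces the index-based warm-up trigger with a check on the module-level warm_uped flag. A minimal sketch of the two gating predicates, written as standalone hypothetical helpers (they are not functions that exist in the repository):

```python
# Hypothetical helpers that only illustrate the gating change in this hunk;
# they are not part of ktransformers.

def should_capture_old(use_cuda_graph: bool, warm_uped: bool, i: int) -> bool:
    # Old logic: graph capture is tied to the decode-step index the caller passes in.
    return use_cuda_graph and ((warm_uped and int(i) == 1) or (not warm_uped and int(i) == 2))

def should_capture_new(use_cuda_graph: bool, warm_uped: bool) -> bool:
    # New logic: capture whenever CUDA graphs are enabled and an eager warm-up
    # decode step has already run (warm_uped is flipped at the end of that step).
    return use_cuda_graph and warm_uped
```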
@@ -93,15 +93,18 @@ class KTransformersInterface(TransformersInterface):
                     use_cache=True,
                 )
 
-        if hasattr(self, "cuda_graph_runner"):
-            logits = self.cuda_graph_runner(self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position)
-            self.cache.change_seq_length(1)
-            torch.cuda.synchronize()
-            logits = logits[0, -1, :]
-            return self.logits_to_token(logits)
+            if hasattr(self, "cuda_graph_runner"):
+                logits = self.cuda_graph_runner(self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position)
+                self.cache.change_seq_length(1)
+                torch.cuda.synchronize()
+                logits = logits[0, -1, :]
+                return self.logits_to_token(logits)
+
+        if self.args.use_cuda_graph:
+            warm_uped = True
 
         if self.use_static_cache:
             mask = torch.ones((1, self.seq_length)).to(torch_device)
             logits = self.model(
...
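Taken together, the two hunks make decode_one_tokens follow a warm-up → capture → replay pattern keyed only on the global warm_uped flag: the first decode step runs eagerly and sets the flag, the next step captures the CUDA graph, and subsequent steps replay it. Below is a self-contained sketch of that pattern using torch.cuda.CUDAGraph and a toy model instead of ktransformers' CUDAGraphRunner and real cache handling; the shapes, buffers, and decode_one_step name are illustrative assumptions only.

```python
import torch

warm_uped = False                       # module-level warm-up flag, as in the diff
graph = None                            # captured CUDA graph, created lazily
model = torch.nn.Linear(16, 16).cuda()
static_in = torch.zeros(1, 16, device="cuda")    # static buffers reused by the graph
static_out = torch.empty(1, 16, device="cuda")

def decode_one_step(x: torch.Tensor) -> torch.Tensor:
    """One decode step: eager on the first call, CUDA-graph replay afterwards."""
    global warm_uped, graph
    if warm_uped:
        if graph is None:
            # Second call overall: record the decode step into a CUDA graph once.
            graph = torch.cuda.CUDAGraph()
            with torch.cuda.graph(graph):
                static_out.copy_(model(static_in))
        # Replay path: stage inputs into the static buffer, replay, synchronize.
        static_in.copy_(x)
        graph.replay()
        torch.cuda.synchronize()
        return static_out.clone()
    # First call: run eagerly so kernels and allocations are warmed up,
    # then flip the flag so the next call can capture the graph.
    out = model(x)
    warm_uped = True
    return out
```

In general, CUDA graph capture expects kernels and workspace allocations to already exist, which is why an eager warm-up step precedes capture; tying that warm-up to a persistent flag rather than a particular decode-step index is what this commit changes.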
ktransformers/server/backend/interfaces/transformers.py

@@ -219,7 +219,7 @@ class TransformersInterface(BackendInterfaceBase):
         self.ever_generated_ids.add(last)
         return last
 
-    def decode_one_tokens(self, i):
+    def decode_one_tokens(self):
         if self.use_static_cache:
             mask = torch.ones((1, self.seq_length)).to(self.args.device)
             logits = self.model(
...

@@ -299,7 +299,7 @@ class TransformersInterface(BackendInterfaceBase):
                 num_heads = self.model.config.num_attention_heads,
                 head_dim_ckv = self.model.config.kv_lora_rank,
                 head_dim_kpe = self.model.config.qk_rope_head_dim,
                 page_size = self.cache.page_size,
                 sm_scale = (self.model.config.qk_rope_head_dim + self.model.config.qk_nope_head_dim) ** (-0.5),
                 q_data_type = torch.bfloat16,
                 kv_data_type = torch.bfloat16
             )
-            next_token = self.decode_one_tokens(i)
+            next_token = self.decode_one_tokens()
             self.profiler.inc("decode")
             if next_token == self.tokenizer.eos_token_id:
                 assert self.args.batch_size == 1
...
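On the caller side, the generation loop now invokes decode_one_tokens() with no per-step index, since warm-up state lives in the module-level flag rather than in the loop counter. A hypothetical sketch of such a loop follows; generate_tokens and its parameters are stand-ins for illustration, not the repository's actual signatures.

```python
from typing import Callable, Iterator

def generate_tokens(decode_one_tokens: Callable[[], int],
                    eos_token_id: int,
                    max_new_tokens: int) -> Iterator[int]:
    """Decode tokens one at a time until EOS or the length budget is reached."""
    for _ in range(max_new_tokens):
        next_token = decode_one_tokens()   # no decode-step index is passed anymore
        yield next_token
        if next_token == eos_token_id:
            break
```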