OpenDAS / ktransformers · Commit 95d937c5
Authored Feb 23, 2025 by DDong Jianwei
Commit message: tmp
Parent: cdb6f896
Showing 4 changed files with 13 additions and 8 deletions.
ktransformers/local_chat.py (+7, -2)
ktransformers/operators/attention.py (+3, -3)
ktransformers/operators/experts.py (+2, -2)
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml (+1, -1)
ktransformers/local_chat.py

```diff
@@ -58,7 +58,7 @@ def local_chat(
     gguf_path: str | None = None,
     max_new_tokens: int = 300,
     cpu_infer: int = Config().cpu_infer,
-    use_cuda_graph: bool = True,
+    use_cuda_graph: bool = False,
     prompt_file: str | None = None,
     mode: str = "normal",
     force_think: bool = False,
@@ -160,6 +160,9 @@ def local_chat(
     input_tensor = tokenizer.apply_chat_template(
         messages, add_generation_prompt=True, return_tensors="pt"
     )
+    # input_tensor = torch.tensor([[0, 6657, 84646]], device=input_tensor.device)
     if force_think:
         token_thinks = torch.tensor([tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device)
         input_tensor = torch.cat(
@@ -181,4 +184,6 @@ def local_chat(
 if __name__ == "__main__":
-    fire.Fire(local_chat)
\ No newline at end of file
+    # fire.Fire(local_chat)
+    # local_chat(model_path="/mnt/data/model/DeepSeek-R1", gguf_path="/mnt/data/model/DeepseekV3-q4km-gguf", cpu_infer=33, force_think=False)
+    local_chat(model_path="/mnt/data/model/Moonlight-16B-A3B-Instruct", gguf_path="/mnt/data/model/Moonlight-16B-A3B-Instruct-GGUF", cpu_infer=33, force_think=False)
\ No newline at end of file
```
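The last hunk replaces the fire CLI entry point with a hard-coded local_chat(...) call for the Moonlight model, so command-line flags no longer reach local_chat. A minimal sketch of the usual fire-based invocation that this commit comments out (the script name run_chat.py and the example paths are placeholders, not part of the commit):

```python
# Sketch only: restores the CLI behavior this commit disables.
import fire

from ktransformers.local_chat import local_chat

if __name__ == "__main__":
    # e.g. python run_chat.py --model_path /path/to/model \
    #      --gguf_path /path/to/gguf --cpu_infer 33 --use_cuda_graph False
    fire.Fire(local_chat)
```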
ktransformers/operators/attention.py

```diff
@@ -441,10 +441,10 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         # mla_wrapper run output: [tokens, self.num_heads, self.kv_lora_rank]
         # attn_output [bsz, q_len, self.num_heads, self.kv_lora_rank]
         # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
-        attn_output = attn_output.transpose(1, 2)
-        attn_output = torch.matmul(attn_output, out_absorb.mT)
-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+        attn_output = attn_output.transpose(1, 2)  # [bsz, self.num_heads, q_len, self.kv_lora_rank]
+        attn_output = torch.matmul(attn_output, out_absorb.mT)  # [bsz, self.num_heads, q_len, self.v_head_dim]
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)  # [bsz, q_len, self.num_heads * self.v_head_dim]
         attn_output = self.o_proj(attn_output)
         return attn_output, None, past_key_value
```
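This hunk only adds trailing shape comments documenting the absorbed MLA output path. A self-contained sketch with placeholder dimensions (bsz, q_len, num_heads, kv_lora_rank, v_head_dim are invented here, not the model's real sizes) that reproduces the annotated shape flow:

```python
import torch

# Hypothetical sizes for illustration; real values come from the
# DeepSeek-V2/V3 config.
bsz, q_len, num_heads, kv_lora_rank, v_head_dim = 1, 4, 8, 16, 12

attn_output = torch.randn(bsz, q_len, num_heads, kv_lora_rank)
out_absorb = torch.randn(num_heads, v_head_dim, kv_lora_rank)

x = attn_output.transpose(1, 2)     # [bsz, num_heads, q_len, kv_lora_rank]
x = torch.matmul(x, out_absorb.mT)  # [bsz, num_heads, q_len, v_head_dim]
x = x.reshape(bsz, q_len, num_heads * v_head_dim)
print(x.shape)  # torch.Size([1, 4, 96])
```

The sketch tracks tensor shapes only; the matmul broadcasts out_absorb's head batch dimension against the [bsz, num_heads] batch dims, matching the comments added in the commit.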
ktransformers/operators/experts.py

```diff
@@ -450,9 +450,9 @@ class KExpertsTorch(KExpertsBase):
             self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
             self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
-        self.up = torch.cat(self.gate, dim=0)
+        self.up = torch.cat(self.up, dim=0)
         self.gate = torch.cat(self.gate, dim=0)
-        self.down = torch.cat(self.gate, dim=0)
+        self.down = torch.cat(self.down, dim=0)
         return

     def unload(self):
```
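This hunk fixes a copy-paste bug: the merged up and down tensors were both concatenated from self.gate, so they silently held gate weights. A standalone sketch with made-up names and shapes showing the load-then-concatenate pattern and the fix:

```python
import torch

# Illustration only: mimics KExpertsTorch collecting per-expert weights
# into lists and merging them. Dimensions here are invented.
num_experts, rows, cols = 4, 2, 3
w = {
    "up":   torch.randn(num_experts, rows, cols),
    "gate": torch.randn(num_experts, rows, cols),
    "down": torch.randn(num_experts, rows, cols),
}

up   = [w["up"][i, ...]   for i in range(num_experts)]
gate = [w["gate"][i, ...] for i in range(num_experts)]
down = [w["down"][i, ...] for i in range(num_experts)]

# Each list must be concatenated from itself; before the fix, up and
# down were built from the gate list instead.
up   = torch.cat(up, dim=0)    # was: torch.cat(gate, dim=0)
gate = torch.cat(gate, dim=0)
down = torch.cat(down, dim=0)  # was: torch.cat(gate, dim=0)
assert up.shape == (num_experts * rows, cols)
```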
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml

```diff
 - match:
     class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
   replace:
-    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
+    class: ktransformers.operators.RoPE.RotaryEmbeddingV3
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
```
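Rules like this drive ktransformers' module injection: each entry matches modules by class path and swaps them for the listed operator class, forwarding kwargs. A rough sketch of how such a match/replace rule could be interpreted; this is an assumption for illustration (the real injection code passes additional arguments such as the original module and the GGUF loader, and the constructor signature below is hypothetical):

```python
import importlib

import torch.nn as nn

def apply_rule(model: nn.Module, rule: dict) -> None:
    """Recursively replace children whose class path matches the rule."""
    target = rule["match"]["class"]
    repl_path, _, repl_name = rule["replace"]["class"].rpartition(".")
    repl_cls = getattr(importlib.import_module(repl_path), repl_name)
    kwargs = rule["replace"].get("kwargs", {})
    for name, child in model.named_children():
        full = f"{type(child).__module__}.{type(child).__qualname__}"
        if full == target:
            # Hypothetical constructor; real injected operators take more args.
            setattr(model, name, repl_cls(**kwargs))
        else:
            apply_rule(child, rule)
```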