OpenDAS / ktransformers · Commits

Commit 26f7b4af (Unverified)
Authored Feb 27, 2025 by wang jiahao; committed by GitHub, Feb 27, 2025
Merge branch 'main' into temperature_top_p_from_request
Parents: 07eb712a, 1f28f75f
Changes: 54
Showing 20 changed files with 709 additions and 100 deletions (+709, -100)
ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h (+9, -9)
ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp (+2, -0)
ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp (+3, -0)
ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp (+2, -0)
ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp (+2, -0)
ktransformers/ktransformers_ext/triton/fp8gemm.py (+193, -0)
ktransformers/local_chat.py (+7, -7)
ktransformers/operators/attention.py (+37, -18)
ktransformers/operators/experts.py (+13, -4)
ktransformers/operators/flashinfer_wrapper.py (+30, -9)
ktransformers/operators/gate.py (+10, -3)
ktransformers/operators/linear.py (+68, -5)
ktransformers/operators/models.py (+9, -4)
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml (+63, -0)
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml (+4, -0)
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml (+157, -0)
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml (+2, -2)
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml (+1, -0)
ktransformers/server/api/ollama/completions.py (+94, -39)
ktransformers/server/api/openai/endpoints/chat.py (+3, -0)
ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h

/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : kkk1nak0
 * @LastEditTime : 2024-08-12 03:48:46
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once
...
@@ -13,10 +13,10 @@
#include <torch/extension.h>
#include <torch/torch.h>

- torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
- torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
- torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
- torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
- torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
- torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
- torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::ScalarType target_dtype);
+ torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+ torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+ torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+ torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+ torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+ torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
+ torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp
...
@@ -10,6 +10,8 @@
#include "kvcache.h"

#include <chrono>

void KVCache::attention_kvhead_(const uint16_t* q_in_data, ggml_fp16_t* output,
                                float* attn_lse, int batch_size,
                                Backend* backend) {
...
ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp
...
@@ -9,6 +9,9 @@
 **/
#include "kvcache.h"

#include <chrono>

void KVCache::load_kvcache(std::string tensor_file_path, Backend* backend) {
    // Timer start
    auto start = std::chrono::high_resolution_clock::now();
...
ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp
...
@@ -10,6 +10,8 @@
#include "kvcache.h"

#include <chrono>

void KVCache::get_anchor_one_block(ggml_fp16_t* anchor, int layer_id,
                                   int block_idx, Backend* backend) {
    // Timer start
...
ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp
...
@@ -10,6 +10,8 @@
#include "kvcache.h"

#include <chrono>

std::string ggml_type_to_string(ggml_type type) {
    switch (type) {
    case GGML_TYPE_F32:
...
ktransformers/ktransformers_ext/triton/fp8gemm.py (new file, mode 100644)

# Adopted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py

from typing import Tuple

import torch
import triton
import triton.language as tl
from triton import Config


@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """
    Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.

    Args:
        x_ptr (triton.Pointer): Pointer to the input tensor.
        y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
        s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
        BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

    Returns:
        None
    """
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    s = tl.max(tl.abs(x)) / 448.
    y = x / s
    y = y.to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)


def act_quant(x: torch.Tensor, block_size: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Quantizes the input tensor `x` using block-wise quantization.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `torch.float8_e4m3fn`.
            - A tensor of scaling factors with dtype `torch.float32`.
    """
    assert x.is_contiguous(), 'Input tensor must be contiguous'
    assert x.size(-1) % block_size == 0, f'Last dimension size must be divisible by block_size (block_size={block_size})'
    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
    grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK_SIZE']), )
    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
    return y, s


@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
    """
    Dequantizes weights using the provided scaling factors and stores the result.

    Args:
        x_ptr (tl.pointer): Pointer to the quantized weights.
        s_ptr (tl.pointer): Pointer to the scaling factors.
        y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
        M (int): Number of rows in the weight matrix.
        N (int): Number of columns in the weight matrix.
        BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    n = tl.cdiv(N, BLOCK_SIZE)
    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs = offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
    s = tl.load(s_ptr + pid_m * n + pid_n)
    y = x * s
    tl.store(y_ptr + offs, y, mask=mask)


def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    """
    Dequantizes the given weight tensor using the provided scale tensor.

    Args:
        x (torch.Tensor): The quantized weight tensor of shape (M, N).
        s (torch.Tensor): The scale tensor of shape (M, N).
        block_size (int, optional): The block size to use for dequantization. Defaults to 128.

    Returns:
        torch.Tensor: The dequantized weight tensor of the same shape as `x`.

    Raises:
        AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
    """
    assert x.is_contiguous() and s.is_contiguous(), 'Input tensors must be contiguous'
    assert x.dim() == 2 and s.dim() == 2, 'Input tensors must have 2 dimensions'
    M, N = x.size()
    y = torch.empty_like(x, dtype=torch.get_default_dtype())
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE']), triton.cdiv(N, meta['BLOCK_SIZE']))
    with torch.cuda.device(x.device):
        weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
    return y


fp8_gemm_configs = [
    Config({'BLOCK_SIZE_M': block_m, 'BLOCK_SIZE_N': block_n, 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
    for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
]

@triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
@triton.jit
def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
                    a_s_ptr, b_s_ptr,
                    M, N: tl.constexpr, K: tl.constexpr,
                    BLOCK_SIZE_M: tl.constexpr,
                    BLOCK_SIZE_N: tl.constexpr,
                    BLOCK_SIZE_K: tl.constexpr):
    """
    Performs a matrix multiplication operation on FP8 matrices with scaling factors.

    Args:
        a_ptr (tl.tensor): Pointer to the first input matrix A.
        b_ptr (tl.tensor): Pointer to the second input matrix B.
        c_ptr (tl.tensor): Pointer to the output matrix C.
        a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A.
        b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B.
        M (int): Number of rows in matrix A and C.
        N (tl.constexpr): Number of columns in matrix B and C.
        K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
        BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
        BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
        BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    k = tl.cdiv(K, BLOCK_SIZE_K)
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
    a_s_ptrs = a_s_ptr + offs_m * k
    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for i in range(k):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
        a_s = tl.load(a_s_ptrs)
        b_s = tl.load(b_s_ptrs)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += BLOCK_SIZE_K
        a_s_ptrs += 1
        b_s_ptrs += 1
    c = accumulator.to(c_ptr.dtype.element_ty)
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, c, mask=mask)


def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor):
    """
    Perform a matrix multiplication using FP8 precision.

    Args:
        a (torch.Tensor): The first input matrix, must be contiguous.
        a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous.
        b (torch.Tensor): The second input matrix, must be contiguous.
        b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous.

    Returns:
        torch.Tensor: The result of the matrix multiplication.
    """
    assert a.is_contiguous() and b.is_contiguous(), 'Input tensors must be contiguous'
    assert a_s.is_contiguous() and b_s.is_contiguous(), 'Scaling factor tensors must be contiguous'
    K = a.size(-1)
    M = a.numel() // K
    N = b.size(0)
    c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype())
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
    fp8_gemm_kernel[grid](a, b, c, a_s, b_s, M, N, K)
    return c
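Not part of the diff: a minimal usage sketch of the kernels above. Shapes, the dummy weight, and its per-128x128-block scale tensor are made up for illustration (in the real path they come from an FP8 checkpoint), and an FP8-capable GPU with a recent PyTorch/Triton is assumed.

    import torch
    from ktransformers.ktransformers_ext.triton.fp8gemm import act_quant, weight_dequant, fp8_gemm

    torch.set_default_dtype(torch.bfloat16)
    x = torch.randn(4, 1024, device="cuda")                                    # activations (M, K); K divisible by 128
    w = torch.randn(512, 1024, device="cuda").to(torch.float8_e4m3fn)          # illustrative FP8 weight (N, K)
    w_s = torch.ones(512 // 128, 1024 // 128, device="cuda", dtype=torch.float32)  # per-block scales (N/128, K/128)

    x_q, x_s = act_quant(x)            # FP8 activations plus one scale per 128-element block
    y = fp8_gemm(x_q, x_s, w, w_s)     # (4, 512) result in the default dtype
    w_bf16 = weight_dequant(w, w_s)    # recover a full-precision copy of the weight from its block scales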
ktransformers/local_chat.py
...
@@ -28,7 +28,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
- from ktransformers.util.utils import prefill_and_generate
+ from ktransformers.util.utils import prefill_and_generate, get_compute_capability
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
...
@@ -54,7 +54,7 @@ default_optimize_rules = {
def local_chat(
    model_path: str | None = None,
-   optimize_rule_path: str = None,
+   optimize_config_path: str = None,
    gguf_path: str | None = None,
    max_new_tokens: int = 300,
    cpu_infer: int = Config().cpu_infer,
...
@@ -94,12 +94,12 @@ def local_chat(
        config, trust_remote_code=True, attn_implementation="flash_attention_2"
    )
-   if optimize_rule_path is None:
+   if optimize_config_path is None:
        if config.architectures[0] in default_optimize_rules:
            print("using default_optimize_rule for", config.architectures[0])
-           optimize_rule_path = default_optimize_rules[config.architectures[0]]
+           optimize_config_path = default_optimize_rules[config.architectures[0]]
        else:
-           optimize_rule_path = input(
+           optimize_config_path = input(
                "please input the path of your rule file(yaml file containing optimize rules):"
            )
...
@@ -107,7 +107,7 @@ def local_chat(
        gguf_path = input(
            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
        )
-   optimize_and_load_gguf(model, optimize_rule_path, gguf_path, config)
+   optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)

    try:
        model.generation_config = GenerationConfig.from_pretrained(model_path)
...
@@ -168,7 +168,7 @@ def local_chat(
    assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
    "please change max_seq_len in ~/.ktransformers/config.yaml"
-   if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled:
+   if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
        generated = prefill_and_generate(
            model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think,
            use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank,
            head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
...
ktransformers/operators/attention.py
...
@@ -16,6 +16,7 @@ from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_ro
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
+ from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
...
@@ -48,12 +49,14 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                 prefill_device: str = "cuda",
                 generate_device: str = "cuda",
                 chunck_size: int = 1000,
+                absorb_for_prefill: bool = False,
                 **kwargs):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, generate_device, **kwargs)
        self.orig_module.__init__(orig_module.config, orig_module.layer_idx)
        self.chunck_size = chunck_size  # TODO, generate chunck_size automatically.
+       self.mla_wrapper = None
+       self.absorb_for_prefill = absorb_for_prefill

    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
...
@@ -242,7 +245,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        q_nope = q_nope.transpose(1, 2)  # q_len is 1, no GPU overhead, same below
        q_nope = torch.matmul(q_nope, q_absorb)  # batched MM
        q_nope = q_nope.transpose(1, 2)
-       assert q_nope.is_contiguous()
+       # assert q_nope.is_contiguous()
        # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
...
@@ -282,6 +285,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
        attn_output = attn_output.transpose(1, 2)
        attn_output = torch.matmul(attn_output, out_absorb.mT)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output)
...
@@ -380,7 +384,7 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim] k_pe [bsz, q_len, 1, self.qk_rope_head_dim]
        # decode
-       if q_len == 1:
+       if q_len == 1 or self.absorb_for_prefill:
            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
                compressed_kv_with_k_pe, page_table = past_key_value.update(compressed_kv, k_pe, self.layer_idx, cache_kwargs)
...
@@ -395,29 +399,42 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
            q_nope = q_nope.transpose(1, 2)  # q_len is 1, no GPU overhead, same below
            q_nope = torch.matmul(q_nope, q_absorb)  # batched MM
            q_nope = q_nope.transpose(1, 2)
-           assert q_nope.is_contiguous()
+           q_nope = q_nope.contiguous()
+           #assert q_nope.is_contiguous()
            # q_nope [bsz, q_len, self.num_heads, self.kv_lora_rank]
            # q_pe [bsz, q_len, self.num_heads, self.qk_rope_head_dim]
-           q_nope.squeeze_(1)
-           q_pe.squeeze_(1)
+           q_nope.squeeze_(0)
+           q_pe.squeeze_(0)

            # flash attn doesn't support head_dim bigger than 256, use flashinfer
            if self.mla_wrapper is None:
                self.mla_wrapper = MLAWrapperSingleton.get_instance(self.device, 1, past_key_value.max_pages, use_cuda_graph=True)
            if self.mla_wrapper.need_plan:
                self.mla_wrapper.need_plan = False
-               self.mla_wrapper.plan(None, None, None,
-                                     position_ids.squeeze(1)+1, self.num_heads, self.kv_lora_rank,
-                                     self.qk_rope_head_dim, past_key_value.page_size, self.softmax_scale,
-                                     q_nope.dtype, compressed_kv.dtype)
+               if q_len == 1:
+                   self.mla_wrapper.plan(None, None, None,
+                                         position_ids.squeeze(1)+1, self.num_heads, self.kv_lora_rank,
+                                         self.qk_rope_head_dim, past_key_value.page_size, self.softmax_scale,
+                                         q_nope.dtype, compressed_kv.dtype)
+               else:
+                   qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device=self.device)
+                   kv_len_arr = torch.tensor([position_ids[0, -1].item()+1], dtype=torch.int32, device=self.device)
+                   self.mla_wrapper.plan(qo_indptr, None, None,
+                                         kv_len_arr, self.num_heads, self.kv_lora_rank,
+                                         self.qk_rope_head_dim, past_key_value.page_size, self.softmax_scale,
+                                         q_nope.dtype, compressed_kv.dtype)
            attn_output = self.mla_wrapper.run(q_nope, q_pe, compressed_kv, k_pe).view(bsz, q_len, self.num_heads, self.kv_lora_rank)
            """
            k = (
                torch.cat([compressed_kv, k_pe], dim=-1)
...
@@ -443,10 +460,11 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
            # out_absorb [self.num_heads, self.v_head_dim, self.kv_lora_rank]
            attn_output = attn_output.transpose(1, 2)  # [bsz, self.num_heads, q_len, self.kv_lora_rank]
            attn_output = torch.matmul(attn_output, out_absorb.mT)  # [bsz, self.num_heads, q_len, self.v_head_dim]
            attn_output = attn_output.transpose(1, 2).contiguous()  # [bsz, q_len, self.num_heads, self.kv_lora_rank]
            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)  # [bsz, q_len, self.num_heads * self.v_head_dim]
            attn_output = self.o_proj(attn_output)
            return attn_output, None, past_key_value
        else:
            if past_key_value is not None:
...
@@ -571,7 +589,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-       if os.name == 'nt':
+       if os.name == 'nt' or get_compute_capability() < 8:
            print("for Windows or GPU before ampere, use forward_windows")
            return self.forward_windows(
                hidden_states,
                attention_mask,
...
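Not part of the diff: a toy shape-check of the absorb trick used above, with DeepSeek-style head dimensions assumed for illustration. The query is projected into the compressed-KV (kv_lora_rank) space before attention and projected back to v_head_dim afterwards, which is what lets decode (and, with the new absorb_for_prefill flag, optionally prefill) run directly against the compressed cache.

    import torch

    bsz, q_len, num_heads = 1, 1, 128                       # assumed sizes
    qk_nope_head_dim, kv_lora_rank, v_head_dim = 128, 512, 128

    q_nope = torch.randn(bsz, q_len, num_heads, qk_nope_head_dim)
    q_absorb = torch.randn(num_heads, qk_nope_head_dim, kv_lora_rank)
    out_absorb = torch.randn(num_heads, v_head_dim, kv_lora_rank)

    # query absorbed into the compressed-KV space: [bsz, q_len, num_heads, kv_lora_rank]
    q_absorbed = torch.matmul(q_nope.transpose(1, 2), q_absorb).transpose(1, 2)

    # ... attention against compressed_kv of width kv_lora_rank happens here ...
    attn_output = torch.randn(bsz, num_heads, q_len, kv_lora_rank)

    # output projected back to the value head dimension: [bsz, num_heads, q_len, v_head_dim]
    attn_output = torch.matmul(attn_output, out_absorb.mT)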
ktransformers/operators/experts.py
...
@@ -245,7 +245,16 @@ class KExpertsCPU(KExpertsBase):
        down_type = None

        for key in keys:
-           if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+           if self.gguf_loader.safetensor_loader is not None:
+               # using a temp ugly way to temprary load the tensor
+               gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
+               up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
+               down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
+               gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
+               up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
+               down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
+           elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
...
@@ -450,9 +459,9 @@ class KExpertsTorch(KExpertsBase):
                self.up[i] = w["up"][i, ...].to(device=device, dtype=self.dtype)
                self.down[i] = w["down"][i, ...].to(device=device, dtype=self.dtype)
-           self.up = torch.cat(self.up, dim=0)
-           self.gate = torch.cat(self.gate, dim=0)
-           self.down = torch.cat(self.down, dim=0)
+           self.up = torch.stack(self.up, dim=0)
+           self.gate = torch.stack(self.gate, dim=0)
+           self.down = torch.stack(self.down, dim=0)
        return

    def unload(self):
...
ktransformers/operators/flashinfer_wrapper.py
...
@@ -9,7 +9,7 @@ flashinfer_enabled = False
try:
    import flashinfer
-   flashinfer_enabled = False  # disabled now, TODO:use new version of flashinfer and enable
+   flashinfer_enabled = True
    print("found flashinfer")
except ImportError:
...
@@ -122,7 +122,7 @@ class MLAWrapper():
        if kv_indices is None:
            assert self.max_batch_size == 1
            kv_indices = self.kv_indices_buf

        self.wrapper.plan(
            qo_indptr,
            kv_indptr,
...
@@ -132,14 +132,14 @@ class MLAWrapper():
            head_dim_ckv,
            head_dim_kpe,
            page_size,
-           False,  # causal is False for decoding
+           True,   # causal
            sm_scale,
            q_data_type,
            kv_data_type,
        )

    def run(self, q_nope, q_pe, ckv, k_pe, return_lse=False):
-       return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse)
+       return self.wrapper.run(q_nope, q_pe, ckv, k_pe, return_lse=return_lse)

class MLAWrapperSingleton():
    wrappers: dict = {}
...
@@ -179,6 +179,24 @@ class MLAWrapperSingleton():
                         sm_scale, q_data_type, kv_data_type,)
            wrapper.need_plan = False

+   @classmethod
+   def need_plan_all(cls):
+       for device, wrapper in cls.wrappers.items():
+           wrapper.need_plan = True
+
+   @classmethod
+   def reset_buffer(cls):
+       for device, wrapper in cls.wrappers.items():
+           wrapper.qo_indptr_buf[1] = 1  # assert max_batch_size=1 here.
+
+   @classmethod
+   def update_buffer(cls, max_pages):
+       for device, wrapper in cls.wrappers.items():
+           wrapper.kv_indptr_buf[1] = max_pages  # assert max_batch_size=1 here.
+           wrapper.kv_indices_buf = torch.arange(0, max_pages, dtype=torch.int32, device=device)
+           wrapper.wrapper._kv_indices_buf = wrapper.kv_indices_buf

if __name__ == "__main__":
...
@@ -187,8 +205,9 @@ if __name__ == "__main__":
    page_size = 64
    num_heads = 128

-   q_nope = torch.randn((1, num_heads, 512), dtype=torch.bfloat16, device="cuda")
-   q_pe = torch.randn((1, num_heads, 64), dtype=torch.bfloat16, device="cuda")
+   q_len = 10
+   q_nope = torch.randn((q_len, num_heads, 512), dtype=torch.bfloat16, device="cuda")
+   q_pe = torch.randn((q_len, num_heads, 64), dtype=torch.bfloat16, device="cuda")
    ckv = torch.randn((max_pages, page_size, 512), dtype=torch.bfloat16, device="cuda")
    k_pe = torch.randn((max_pages, page_size, 64), dtype=torch.bfloat16, device="cuda")
...
@@ -199,10 +218,10 @@ if __name__ == "__main__":
        max_pages,
    )
-   kv_len_arr = torch.tensor([10], dtype=torch.int32, device="cuda")
+   kv_len_arr = torch.tensor([q_len], dtype=torch.int32, device="cuda")
+   qo_indptr = torch.tensor([0, q_len], dtype=torch.int32, device="cuda")
    wrapper.plan(
-       None,
+       qo_indptr,
        None,
        None,
        kv_len_arr,
...
@@ -216,6 +235,7 @@ if __name__ == "__main__":
    )
    attn_output = wrapper.run(q_nope, q_pe, ckv, k_pe)
+   print(attn_output.shape)

    k = (
        torch.cat([ckv, k_pe], dim=-1)
...
@@ -235,6 +255,7 @@ if __name__ == "__main__":
        False,
        192 ** (-0.5)
    )
+   print(attn_ref.shape)
    torch.testing.assert_close(attn_output, attn_ref, rtol=1e-3, atol=1e-3)
    print("test past")
ktransformers/operators/gate.py
...
@@ -67,7 +67,14 @@ class KMoEGateBase(ABC):
        for key in keys:
            key = ".".join(key.split(".")[:-1])
-           if key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
+           if self.gguf_loader.safetensor_loader is not None:
+               targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+               weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
+               e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
+               weight_type = weight.dtype
+               e_score_correction_bias_type = e_score_correction_bias.dtype
+               res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
+           elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
                tensors = self.load_multi(key, targets, device=device)
                weight = tensors[".ffn_gate_inp.weight"]
...
@@ -116,8 +123,8 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
-       self.orig_module.weight = self.orig_module.weight.to(device)
-       self.orig_module.e_score_correction_bias = self.orig_module.e_score_correction_bias.to(device)
+       self.orig_module.weight = nn.Parameter(self.orig_module.weight.to(device))
+       self.orig_module.e_score_correction_bias = nn.Parameter(self.orig_module.e_score_correction_bias.to(device))

    def unload(self):
        if self.weight is not None:
...
ktransformers/operators/linear.py
...
@@ -26,6 +26,7 @@ from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marl
)
from ktransformers.operators.base_operator import BaseInjectedModule
from transformers.configuration_utils import PretrainedConfig
+ from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from abc import ABC, abstractmethod
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
...
@@ -78,7 +79,13 @@ class KLinearBase(ABC):
            keys = [self.key]

        for key in keys:
-           if key + ".weight" in self.gguf_loader.tensor_file_map:
+           if self.gguf_loader.safetensor_loader is not None:
+               # using safetensor_loader
+               tensor = self.gguf_loader.safetensor_loader.load_tensor(key + '.weight')
+               weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key + '.weight_scale_inv')
+               return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
+           elif key + ".weight" in self.gguf_loader.tensor_file_map:
                if key + ".bias" in self.gguf_loader.tensor_file_map:
                    tensors = self.load_multi(key, ["weight", "bias"], device=device)
                    tensor = tensors["weight"]
...
@@ -169,7 +176,61 @@ class KLinearTorch(KLinearBase):
        if self.has_bias:
            self.bias = None

+ class KLinearFP8(KLinearBase):
+     # this kernel requires special handling for weight
+     # Please load the weight file downloaded from KVCache.AI
+     marlin_q_w: torch.Tensor
+     marlin_s: torch.Tensor
+     g_idx: torch.Tensor
+     sort_indices: torch.Tensor
+     has_bias: bool
+     weight: torch.Tensor
+     scale_w: torch.Tensor
+     bias: torch.Tensor
+     def __init__(
+         self,
+         key: str,
+         gguf_loader: GGUFLoader,
+         config: PretrainedConfig,
+         orig_module: nn.Module = None,
+         device: str = "cuda",
+         block_size: int = 128,
+         **kwargs,
+     ):
+         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+         self.has_bias = False
+         self.dtype = torch.get_default_dtype()
+         self.block_size = block_size
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = x.to(self.device)
+         orig_dtype = x.dtype
+         x_quantized, scale_x = act_quant(x, self.block_size)
+         y = fp8_gemm(x_quantized, scale_x, self.weight, self.weight_scale_inv)
+         return y.to(dtype=orig_dtype)
+     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None):
+         if device is None: device = self.device
+         if w is None: w = self.load_weight(device=device)
+         ### TODO fit weight_inv format
+         if isinstance(w, tuple):
+             self.weight = w[0].to(device)
+             self.weight_scale_inv = w[1].to(device)
+             self.has_bias = False
+         else:
+             raise ValueError("Invalid weight type")
+         self.weight = self.weight.to(device)
+         if self.has_bias:
+             self.bias = self.bias.to(device)
+     def unload(self):
+         if self.weight is not None:
+             self.weight = None
+         if self.has_bias:
+             self.bias = None

class KLinearMarlin(KLinearBase):
    marlin_q_w: torch.Tensor
    marlin_s: torch.Tensor
...
@@ -404,7 +465,8 @@ class KLinearCPUInfer(KLinearBase):
LINEAR_MAP = {
    "KLinearMarlin": KLinearMarlin,
    "KLinearTorch": KLinearTorch,
-   "KLinearCPUInfer": KLinearCPUInfer
+   "KLinearCPUInfer": KLinearCPUInfer,
+   "KLinearFP8": KLinearFP8,
}

class KTransformersLinear(BaseInjectedModule, KLinearBase):
...
@@ -440,10 +502,11 @@ class KTransformersLinear(BaseInjectedModule, KLinearBase):
    def forward(self, x):
        if self.mode == InferenceState.PREFILL:
            assert self.prefill_linear is not None, "cpu linear is not initialized"
-           return self.prefill_linear.forward(x)
+           y = self.prefill_linear.forward(x)
        else:
            assert self.generate_linear is not None, "gpu linear is not initialized"
-           return self.generate_linear.forward(x)
+           y = self.generate_linear.forward(x)
+       return y

    def load(self, w: dict | nn.Parameter | tuple | None = None, mode: InferenceState = InferenceState.GENERATE):
        if not mode:
...
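Not part of the diff: a plain-PyTorch reference for the math KLinearFP8.forward computes, written out for orientation. The function name, shapes, and the dequantize-then-matmul formulation are illustrative assumptions; the real path quantizes the activations and calls the Triton fp8_gemm kernel instead of dequantizing the weight.

    import torch

    def fp8_linear_reference(x: torch.Tensor, weight: torch.Tensor, weight_scale_inv: torch.Tensor,
                             block_size: int = 128) -> torch.Tensor:
        # Reference only: expand the per-block scales, dequantize the FP8 weight, then do a normal matmul.
        # weight: (N, K) float8_e4m3fn; weight_scale_inv: (ceil(N/block), ceil(K/block)) float32
        w = weight.to(torch.float32)
        scales = weight_scale_inv.repeat_interleave(block_size, 0).repeat_interleave(block_size, 1)
        w = w * scales[: w.shape[0], : w.shape[1]]
        return (x.to(torch.float32) @ w.T).to(x.dtype)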
ktransformers/operators/models.py
...
@@ -56,7 +56,7 @@ from ktransformers.models.modeling_deepseek import (
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
- from ktransformers.util.utils import InferenceState
+ from ktransformers.util.utils import InferenceState, get_compute_capability
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
...
@@ -649,9 +649,14 @@ class KDeepseekV2Model(BaseInjectedModule):
        if per_layer_prefill_flag:
            causal_mask = None
        else:
-           causal_mask = self._update_causal_mask(
-               attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-           )
+           if os.name == 'nt' or get_compute_capability() < 8:
+               print("for Windows or GPU before ampere, use forward_windows")
+               # only use mask in forward windows or can't flash attn
+               causal_mask = self._update_causal_mask(
+                   attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+               )
+           else:
+               causal_mask = None

        # embed positions
        hidden_states = inputs_embeds
...
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
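Not part of the diff: how a rule file like this would typically be selected, using the optimize_config_path argument introduced in local_chat.py above. Model and GGUF paths are placeholders, and calling local_chat through the Python API like this is an assumption (the usual entry point is the same function via the CLI).

    # Sketch: run local chat with the new FP8-linear / GGML-experts rule file.
    from ktransformers.local_chat import local_chat

    local_chat(
        model_path="deepseek-ai/DeepSeek-V3",      # placeholder
        gguf_path="/path/to/DeepSeek-V3-GGUF",     # placeholder
        optimize_config_path="ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml",
    )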
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-4.yaml
...
@@ -293,6 +293,7 @@
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
+     absorb_for_prefill: False

# GPU 1: layers 15–29
- match:
...
@@ -302,6 +303,7 @@
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
+     absorb_for_prefill: False

# GPU 2: layers 30–44
- match:
...
@@ -311,6 +313,7 @@
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
+     absorb_for_prefill: False

# GPU 3: layers 45–60
- match:
...
@@ -320,6 +323,7 @@
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
+     absorb_for_prefill: False

# === Overall Model Replacement with Transfer Map ===
...
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml (new file, mode 100644)

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False  # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False  # change this to True to enable long context(prefill may slower).
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False  # change this to True to enable long context(prefill may slower).

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
      transfer_map:
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu-marlin.yaml
...
@@ -168,5 +168,5 @@
  replace:
    class: "default"
    kwargs:
-     generate_device: "cuda:0"
-     prefill_device: "cuda:0"
+     generate_device: "cuda:1"
+     prefill_device: "cuda:1"
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
...
@@ -60,6 +60,7 @@
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
+     absorb_for_prefill: False  # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
...
ktransformers/server/api/ollama/completions.py
...
@@ -12,8 +12,8 @@ from ktransformers.server.config.config import Config
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import check_link_response
from ktransformers.server.backend.base import BackendInterfaceBase

router = APIRouter(prefix='/api')

# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
class OllamaGenerateCompletionRequest(BaseModel):
...
@@ -40,61 +40,121 @@ class OllamaGenerateCompletionRequest(BaseModel):
    keep_alive: Optional[str] = Field(
        "5m", description="Controls how long the model will stay loaded into memory following the request.")

class OllamaGenerationStreamResponse(BaseModel):
    model: str
    created_at: str
    response: str
    done: bool = Field(...)

class OllamaGenerationResponse(BaseModel):
    pass

@router.post("/generate", tags=['ollama'])
async def generate(request: Request, input: OllamaGenerateCompletionRequest):
    id = str(uuid4())

    interface: BackendInterfaceBase = get_interface()
    print(f'COMPLETION INPUT:----\n{input.prompt}\n----')

    config = Config()

    if input.stream:
        async def inner():
            async for token in interface.inference(input.prompt, id):
                d = OllamaGenerationStreamResponse(model=config.model_name, created_at=str(datetime.now()), response=token, done=False)
                yield d.model_dump_json() + '\n'
-               # d = {'model':config.model_name,'created_at':"", 'response':token,'done':False}
-               # yield f"{json.dumps(d)}\n"
-           # d = {'model':config.model_name,'created_at':"", 'response':'','done':True}
-           # yield f"{json.dumps(d)}\n"
            d = OllamaGenerationStreamResponse(model=config.model_name, created_at=str(datetime.now()), response='', done=True)
            yield d.model_dump_json() + '\n'
        return check_link_response(request, inner())
    else:
        raise NotImplementedError

# https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-chat-completion
class OllamaChatCompletionMessage(BaseModel):
    role: str
    content: str

class OllamaChatCompletionRequest(BaseModel):
-   pass
+   model: str = Field(..., description="The model name, which is required.")
+   messages: List[OllamaChatCompletionMessage] = Field(..., description="A list of messages to generate a response for.")
+   stream: bool = Field(True, description="If true, the response will be streamed.")

class OllamaChatCompletionStreamResponse(BaseModel):
-   pass
+   model: str
+   created_at: str
+   message: dict
+   done: bool = Field(...)
+   total_duration: Optional[int] = Field(None, description="Total time spent in nanoseconds")
+   load_duration: Optional[int] = Field(None, description="Time spent loading model in nanoseconds")
+   prompt_eval_count: Optional[int] = Field(None, description="Number of tokens in prompt")
+   prompt_eval_duration: Optional[int] = Field(None, description="Time spent evaluating prompt in nanoseconds")
+   eval_count: Optional[int] = Field(None, description="Number of tokens generated")
+   eval_duration: Optional[int] = Field(None, description="Time spent generating response in nanoseconds")

class OllamaChatCompletionResponse(BaseModel):
    pass

@router.post("/chat", tags=['ollama'])
async def chat(request: Request, input: OllamaChatCompletionRequest):
-   raise NotImplementedError
+   id = str(uuid4())
+   interface: BackendInterfaceBase = get_interface()
+   config = Config()
+
+   # Convert the messages into a single prompt string
+   prompt = ""
+   for msg in input.messages:
+       prompt += f"{msg.role}: {msg.content}\n"
+   prompt += "assistant:"
+
+   if input.stream:
+       async def inner():
+           start_time = time()  # record the start time (in seconds)
+           eval_count = 0       # count of generated tokens
+           tokens = []
+
+           async for token in interface.inference(prompt, id):
+               d = OllamaChatCompletionStreamResponse(
+                   model=config.model_name,
+                   created_at=str(datetime.now()),
+                   message={"role": "assistant", "content": token},
+                   done=False
+               )
+               yield d.model_dump_json() + '\n'
+           # compute the performance statistics
+           end_time = time()
+           total_duration = int((end_time - start_time) * 1_000_000_000)  # convert to nanoseconds
+           prompt_eval_count = len(prompt.split())  # rough estimate of the prompt token count
+           eval_duration = total_duration           # assume all of the time was spent generating (simplification)
+           prompt_eval_duration = 0                 # assume no separate prompt-evaluation time
+           load_duration = 0                        # assume the load time is unknown
+
+           d = OllamaChatCompletionStreamResponse(
+               model=config.model_name,
+               created_at=str(datetime.now()),
+               message={},
+               done=True,
+               total_duration=total_duration,
+               load_duration=load_duration,
+               prompt_eval_count=prompt_eval_count,
+               prompt_eval_duration=prompt_eval_duration,
+               eval_count=eval_count,
+               eval_duration=eval_duration
+           )
+           yield d.model_dump_json() + '\n'
+       return check_link_response(request, inner())
+   else:
+       raise NotImplementedError("Non-streaming chat is not implemented.")

# https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models
class OllamaModel(BaseModel):
...
@@ -103,9 +163,8 @@ class OllamaModel(BaseModel):
    size: int
    # TODO: fill the rest correctly

-# mock ollama
@router.get("/tags", tags=['ollama'])
async def tags():
    config = Config()
    # TODO: fill this correctly, although it does not effect Tabby
...
@@ -138,25 +197,21 @@ class OllamaShowResponse(BaseModel):
    class Config:
        protected_namespaces = ()

@router.post("/show", tags=['ollama'])
async def show(request: Request, input: OllamaShowRequest):
    config = Config()
    # TODO: Add more info in config to return, although it does not effect Tabby
    return OllamaShowResponse(
        modelfile="# Modelfile generated by ...",
        parameters=" ",
        template=" ",
        details=OllamaShowDetial(
            parent_model=" ",
            format="gguf",
            family=" ",
            families=[" "],
            parameter_size=" ",
            quantization_level=" "
        ),
        model_info=OllamaModelInfo()
    )
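Not part of the diff: a small client sketch for the newly implemented streaming /api/chat route, matching the request and response models defined above. The host, port, and model name are placeholders.

    import json
    import requests

    resp = requests.post(
        "http://localhost:10002/api/chat",   # host/port are placeholders
        json={"model": "DeepSeek-V3",
              "messages": [{"role": "user", "content": "hello"}],
              "stream": True},
        stream=True,
    )
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)             # one OllamaChatCompletionStreamResponse per line
        if chunk["done"]:
            break
        print(chunk["message"]["content"], end="", flush=True)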
ktransformers/server/api/openai/endpoints/chat.py
...
@@ -25,6 +25,9 @@ async def chat_completion(request:Request,create:ChatCompletionCreate):
    input_message = [json.loads(m.model_dump_json()) for m in create.messages]

+   if Config().api_key != '':
+       assert request.headers.get('Authorization', '').split()[-1] == Config().api_key
+
    if create.stream:
        async def inner():
            chunk = ChatCompletionChunk(id=id, object='chat.completion.chunk', created=int(time()))
...
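Not part of the diff: how the new api_key check looks from the client side. Only the Authorization handling comes from the code above; the endpoint path, host, port, and model name are assumptions for an OpenAI-style route.

    import requests

    headers = {"Authorization": "Bearer YOUR_API_KEY"}   # the server compares the last token of this header to Config().api_key
    payload = {"model": "DeepSeek-V3",
               "messages": [{"role": "user", "content": "hi"}],
               "stream": False}
    resp = requests.post("http://localhost:10002/v1/chat/completions",   # placeholder URL
                         headers=headers, json=payload)
    print(resp.status_code)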