ox696c / ktransformers

Commit 0d7d2663, authored Mar 21, 2025 by zhanggzh
change setup version code and Add support for Z100/Z100L
Parent: 8442a745
Showing 8 changed files with 90 additions and 20 deletions (+90 / -20)
.setup.py.swp                           +0  -0
install_dcu.sh                          +16 -8
ktransformers/__init__.py               +4  -3
ktransformers/local_chat.py             +4  -2
ktransformers/operators/attention.py    +6  -3
ktransformers/operators/models.py       +6  -3
ktransformers/util/utils.py             +12 -0
setup.py                                +42 -1
.setup.py.swp (new file, mode 100644)
File added
install_dcu.sh

#!/bin/bash
set -e

# Clean build directories and old distribution files
rm -rf build
rm -rf dist
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info

echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt

echo "Initializing Git submodules..."
git submodule update --init --recursive

export USE_FASTPT_CUDA=True
export CMAKE_BUILD_PARALLEL_LEVEL=32

echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
echo "Installation completed successfully"

echo "Building the ktransformers wheel package"
mkdir -p dist
KTRANSFORMERS_FORCE_BUILD=TRUE pip wheel . -w dist --no-build-isolation --no-deps
echo "The generated wheel package is located at:"
ls -l dist/*.whl
echo "Build succeeded! The wheel package has been generated in the dist directory"
ktransformers/__init__.py

 #!/usr/bin/env python
 # coding=utf-8
 '''
 Description  :
 Author       : kkk1nak0
 Date         : 2024-08-15 07:34:46
 Version      : 1.0.0
 LastEditors  : chenxl
 LastEditTime : 2025-02-15 03:53:02
 '''
-__version__ = "0.2.3.post1"
+__version__ = "0.2.3post1"
+__hcu_version__ = '0.2.3post1+das.dtk2504'
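
Side note on the version strings above: "0.2.3post1" and "0.2.3.post1" are the same release under PEP 440 normalization, and "+das.dtk2504" is a local version label. A minimal sketch, not part of the commit, assuming the packaging library is available:

from packaging.version import Version

# The post-release separator is optional in PEP 440, so both spellings normalize identically.
assert Version("0.2.3post1") == Version("0.2.3.post1")

# "+das.dtk2504" is a local version label tagging the DTK/ROCm build.
v = Version("0.2.3post1+das.dtk2504")
print(v.public, v.local)   # 0.2.3.post1 das.dtk2504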
ktransformers/local_chat.py

...
@@ -28,7 +28,8 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
 from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
 from ktransformers.models.modeling_llama import LlamaForCausalLM
 from ktransformers.models.modeling_mixtral import MixtralForCausalLM
-from ktransformers.util.utils import prefill_and_generate, get_compute_capability
+#from ktransformers.util.utils import prefill_and_generate, get_compute_capability
+from ktransformers.util.utils import prefill_and_generate, get_compute_capability, get_device_name
 from ktransformers.server.config.config import Config
 from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
...
@@ -169,7 +170,8 @@ def local_chat(
     assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
     "please change max_seq_len in ~/.ktransformers/config.yaml"
-    if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
+    #if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
+    if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled and (get_compute_capability() >= 8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name())):
         generated = prefill_and_generate(
             model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
             use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
...
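
One detail worth flagging when reading the new condition: Python groups config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM" as (x == "DeepseekV2ForCausalLM") or "DeepseekV3ForCausalLM", and a non-empty string is truthy, so that sub-expression no longer restricts the architecture; the stricter comparison survives only in the commented-out line. A minimal illustration, not taken from the repository:

arch = "LlamaForCausalLM"

# The committed form: the right operand is a bare non-empty string, so the expression is always truthy.
print(bool(arch == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM"))    # True

# What the commented-out line expressed:
print(arch == "DeepseekV2ForCausalLM" or arch == "DeepseekV3ForCausalLM")  # False

# An equivalent membership test:
print(arch in ("DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"))          # False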
ktransformers/operators/attention.py

...
@@ -16,7 +16,8 @@ from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_ro
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
-from ktransformers.util.utils import get_compute_capability
+#from ktransformers.util.utils import get_compute_capability
+from ktransformers.util.utils import get_compute_capability, get_device_name
 import logging
 from transformers.configuration_utils import PretrainedConfig
 from transformers.cache_utils import Cache
...
@@ -589,8 +590,10 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         cache_position: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if os.name == 'nt' or get_compute_capability() < 8:
-            print("for Windows or GPU before ampere, use forward_windows")
+        #if os.name == 'nt' or get_compute_capability()<8:
+        #print("for Windows or GPU before ampere, use forward_windows")
+        if os.name == 'nt' or get_compute_capability() < 8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name()):
+            print("for Windows or GPU before ampere or Z100/Z100L, use forward_windows")
             return self.forward_windows(
                 hidden_states,
                 attention_mask,
...
ktransformers/operators/models.py

...
@@ -56,7 +56,8 @@ from ktransformers.models.modeling_deepseek import (
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.utils import InferenceState, get_compute_capability
+#from ktransformers.util.utils import InferenceState, get_compute_capability
+from ktransformers.util.utils import InferenceState, get_compute_capability, get_device_name
 from ktransformers.util.custom_gguf import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
...
@@ -649,8 +650,10 @@ class KDeepseekV2Model(BaseInjectedModule):
         if per_layer_prefill_flag:
             causal_mask = None
         else:
-            if os.name == 'nt' or get_compute_capability() < 8:
-                print("for Windows or GPU before ampere, use forward_windows")
+            #if os.name == 'nt' or get_compute_capability()<8:
+            #    print("for Windows or GPU before ampere, use forward_windows")
+            if os.name == 'nt' or get_compute_capability() < 8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name()):
+                print("for Windows or GPU before ampere or Z100/Z100L, use forward_windows")
                 # only use mask in forward windows or can't flash attn
                 causal_mask = self._update_causal_mask(
                     attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
...
ktransformers/util/utils.py

...
@@ -33,6 +33,18 @@ def get_compute_capability(device:torch.device = None):
     else:
         return torch.cuda.get_device_properties(device)

+def get_device_name(device:torch.device = None):
+    if torch.cuda.is_available():
+        if device is None:
+            num_gpus = torch.cuda.device_count()
+            gpu_name = []
+            for gpu_id in range(num_gpus):
+                gpu_name.append(torch.cuda.get_device_name(gpu_id))
+            return gpu_name
+        else:
+            return torch.cuda.get_device_name(device)
+
 def set_module(model, submodule_key, module):
     tokens = submodule_key.split('.')
     sub_tokens = tokens[:-1]
...
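
Since the new get_device_name returns a list of names when called without a device argument, the "Z100" in get_device_name() guards added in the files above are exact element matches against whatever string torch.cuda.get_device_name reports, not substring matches. A small illustration with assumed, hypothetical device-name strings (the real names depend on the DCU driver):

# Hypothetical values standing in for what get_device_name() might return:
names_exact = ["Z100"]
names_longer = ["Hygon DCU Z100L"]

# "in" on a list is an exact element match:
print("Z100" in names_exact)     # True
print("Z100" in names_longer)    # False, even though the substring appears in the name

# A substring-style check would have to inspect each element:
print(any("Z100" in n for n in names_longer))   # True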
setup.py

...
@@ -377,8 +377,49 @@ elif MUSA_HOME is not None:
 else:
     raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")

+ROCM_PATH = os.getenv('ROCM_PATH')
+dtk_path = ROCM_PATH + '/.info/rocm_version'
+with open(dtk_path, 'r') as file:
+    content = file.read().strip()
+    dtk_version = content.replace('.', '')
+    print(dtk_version)
+
+cwd = os.path.dirname(os.path.abspath(__file__))
+ver_path = os.path.join(cwd, "ktransformers", "__init__.py")
+with open(ver_path, "r", encoding="utf-8") as file:
+    for line in file:
+        match = re.search(r'^__version__\s*=\s*["\'](.*?)["\']', line)
+        if match:
+            k_version = match.group(1)
+            break
+    else:
+        raise RuntimeError("__version__ information not found")
+
+with open(ver_path, 'r') as f:
+    lines = f.readlines()
+
+# Check whether __hcu_version__ already exists
+found = False
+new_lines = []
+for line in lines:
+    if line.startswith("__hcu_version__"):
+        # Replace the existing __hcu_version__
+        version = k_version + '+das.dtk' + dtk_version
+        new_lines.append(f"__hcu_version__ = '{version}'\n")
+        found = True
+    else:
+        new_lines.append(line)
+
+# If __hcu_version__ was not found, append it to the end of the file
+if not found:
+    version = k_version + '+das.dtk' + dtk_version
+    new_lines.append(f"__hcu_version__ = '{version}'\n")
+
+# Write back to the file
+with open(ver_path, 'w') as f:
+    f.writelines(new_lines)
+
 setup(
-    version=VersionInfo().get_package_version(),
+    version=k_version + '+das.dtk' + dtk_version,
     cmdclass={"bdist_wheel": BuildWheelsCommand, "build_ext": CMakeBuild},
     ext_modules=[
         CMakeExtension("cpuinfer_ext"),
...
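
Putting the setup.py block together with the __init__.py change: the wheel version is composed from the __version__ read out of ktransformers/__init__.py and the DTK/ROCm version read from $ROCM_PATH/.info/rocm_version with the dots stripped. A minimal sketch with assumed sample inputs (the actual rocm_version contents depend on the installed toolkit):

k_version = "0.2.3post1"                 # extracted from ktransformers/__init__.py
content = "25.04"                        # assumed contents of $ROCM_PATH/.info/rocm_version
dtk_version = content.replace('.', '')   # "2504"

version = k_version + '+das.dtk' + dtk_version
print(f"__hcu_version__ = '{version}'")  # __hcu_version__ = '0.2.3post1+das.dtk2504'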