OpenDAS / ktransformers · Commits

Commit 8442a745, authored Mar 14, 2025 by zhanggzh (parent 4f22d726)

Add support for HYGON DCU

Showing 4 changed files with 114 additions and 11 deletions:
- install_dcu.sh (+20 -0)
- ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh (+1 -1)
- ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-dcu.yaml (+76 -0)
- setup.py (+17 -10)
install_dcu.sh (new file, 0 → 100644)
#!/bin/bash
set -e

# clear build dirs
rm -rf build
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info

echo "Installing python dependencies from requirements-local_chat.txt"
pip install -r requirements-local_chat.txt

export USE_FASTPT_CUDA=True
export CMAKE_BUILD_PARALLEL_LEVEL=32

echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation

echo "Installation completed successfully"
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
@@ -55,7 +55,7 @@ class ScalarType<nv_bfloat16> {
   using FragC = Vec<float, 4>;
   using FragS = Vec<nv_bfloat162, 1>;

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || KTRANSFORMERS_USE_DTK
   static __device__ float inline num2float(const nv_bfloat16 x) {
     return __bfloat162float(x);
   }
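This one-line change widens the compile guard around the bfloat16 helpers so they are also built for HYGON DCU (DTK) toolchains. Note the operator precedence: && binds tighter than ||, so the condition reads (Ampere-or-newer CUDA target) OR (DTK build). A small truth-table sketch of that predicate (illustrative Python, not project code):

# Models the preprocessor condition introduced by this commit:
#   defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || KTRANSFORMERS_USE_DTK
# '&&' binds tighter than '||', so DTK builds take the branch regardless of arch.
def bf16_helpers_enabled(has_cuda_arch, cuda_arch, use_dtk):
    return (has_cuda_arch and cuda_arch >= 800) or use_dtk

assert bf16_helpers_enabled(True, 800, False)       # Ampere+ CUDA build
assert not bf16_helpers_enabled(True, 750, False)   # pre-Ampere, no DTK: fallback path
assert bf16_helpers_enabled(False, 0, True)         # HYGON DCU (DTK) build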
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-dcu.yaml (new file, 0 → 100644)
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
setup.py
@@ -328,22 +328,29 @@ class CMakeBuild(BuildExtension):
             ["cmake", "--build", ".", "--verbose", *build_args],
             cwd=build_temp, check=True)

+USE_FASTPT_CUDA = os.getenv('USE_FASTPT_CUDA', 'False').lower() == 'true'
+
 if CUDA_HOME is not None:
-    ops_module = CUDAExtension('KTransformersOps', [
+    extra_nvcc_flags = [
+        '-O3',
+        '--use_fast_math',
+        '-Xcompiler', '-fPIC',
+        '-DKTRANSFORMERS_USE_CUDA',
+    ]
+    if USE_FASTPT_CUDA:
+        extra_nvcc_flags.append('-DKTRANSFORMERS_USE_DTK')
+    ops_module = CUDAExtension('KTransformersOps', [
             'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
             'ktransformers/ktransformers_ext/cuda/binding.cpp',
             'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
         ],
         extra_compile_args={
             'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
-            'nvcc': [
-                '-O3',
-                '--use_fast_math',
-                '-Xcompiler', '-fPIC',
-                '-DKTRANSFORMERS_USE_CUDA',
-            ]
+            'nvcc': extra_nvcc_flags
         })
 elif MUSA_HOME is not None:
     SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={
         # Common rules
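Taken together with install_dcu.sh, the effect is that exporting USE_FASTPT_CUDA=True adds -DKTRANSFORMERS_USE_DTK to the nvcc flags while everything else stays the same. A standalone sketch of that flag selection (nvcc_flags is a hypothetical helper mirroring the diff, not setup.py itself):

# Illustrative: reproduce the flag selection from the diff above for both
# settings of USE_FASTPT_CUDA. Names mirror the diff; this is not setup.py.
def nvcc_flags(env):
    use_fastpt = env.get('USE_FASTPT_CUDA', 'False').lower() == 'true'
    flags = ['-O3', '--use_fast_math', '-Xcompiler', '-fPIC',
             '-DKTRANSFORMERS_USE_CUDA']
    if use_fastpt:
        flags.append('-DKTRANSFORMERS_USE_DTK')
    return flags

print(nvcc_flags({}))                           # plain CUDA build
print(nvcc_flags({'USE_FASTPT_CUDA': 'True'}))  # DCU build adds -DKTRANSFORMERS_USE_DTK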