Commit 0d7d2663 authored by zhanggzh

Change setup version code and add support for Z100/Z100L

parent 8442a745
Pipeline #2570 canceled
File added
 #!/bin/bash
 set -e
-# clear build dirs
+# Clean the build directories and old distribution files
 rm -rf build
+rm -rf dist
 rm -rf *.egg-info
 rm -rf ktransformers/ktransformers_ext/build
 rm -rf ktransformers/ktransformers_ext/cuda/build
 rm -rf ktransformers/ktransformers_ext/cuda/dist
 rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
-echo "Installing python dependencies from requirements.txt"
-pip install -r requirements-local_chat.txt
+echo "Initializing Git submodules..."
+git submodule update --init --recursive
 export USE_FASTPT_CUDA=True
 export CMAKE_BUILD_PARALLEL_LEVEL=32
-echo "Installing ktransformers"
-KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
-echo "Installation completed successfully"
+echo "Building the ktransformers wheel package"
+mkdir -p dist
+KTRANSFORMERS_FORCE_BUILD=TRUE pip wheel . -w dist --no-build-isolation --no-deps
+echo "Generated wheel package:"
+ls -l dist/*.whl
+echo "Build succeeded! The wheel package is in the dist directory"
 #!/usr/bin/env python
 # coding=utf-8
 '''
 Description  :
 Author       : kkk1nak0
 Date         : 2024-08-15 07:34:46
 Version      : 1.0.0
 LastEditors  : chenxl
 LastEditTime : 2025-02-15 03:53:02
 '''
-__version__ = "0.2.3.post1"
+__version__ = "0.2.3post1"
+__hcu_version__ = '0.2.3post1+das.dtk2504'
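
Once a wheel built by this commit is installed, both version attributes can be read back directly. A minimal check, assuming a DTK 25.04 toolchain that produces the default value above:

    import ktransformers

    print(ktransformers.__version__)      # expected: 0.2.3post1
    print(ktransformers.__hcu_version__)  # expected: 0.2.3post1+das.dtk2504 (assuming DTK/ROCm 25.04)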
@@ -28,7 +28,8 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
 from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
 from ktransformers.models.modeling_llama import LlamaForCausalLM
 from ktransformers.models.modeling_mixtral import MixtralForCausalLM
-from ktransformers.util.utils import prefill_and_generate, get_compute_capability
+#from ktransformers.util.utils import prefill_and_generate, get_compute_capability
+from ktransformers.util.utils import prefill_and_generate, get_compute_capability, get_device_name
 from ktransformers.server.config.config import Config
 from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
@@ -169,7 +170,8 @@ def local_chat(
     assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
         "please change max_seq_len in ~/.ktransformers/config.yaml"
-    if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
+    #if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
+    if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and (get_compute_capability() >= 8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name())):
         generated = prefill_and_generate(
             model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
             use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
...
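
The same capability-or-device gate recurs in the operator changes below. A minimal sketch of the condition factored into a standalone predicate (the helper name is illustrative and not part of the commit; device_names is the list returned by get_device_name()):

    def use_flashinfer_mla(system, arch, flashinfer_enabled, compute_capability, device_names):
        # DeepSeek V2/V3 on non-Windows, with flashinfer available, and either an
        # Ampere-or-newer GPU (compute capability >= 8) or a Z100/Z100L device.
        is_deepseek = arch in ("DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM")
        is_z100 = "Z100" in device_names or "Z100L" in device_names
        return (system != "Windows" and is_deepseek and flashinfer_enabled
                and (compute_capability >= 8 or is_z100))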
@@ -16,7 +16,8 @@ from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_ro
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
-from ktransformers.util.utils import get_compute_capability
+#from ktransformers.util.utils import get_compute_capability
+from ktransformers.util.utils import get_compute_capability, get_device_name
 import logging
 from transformers.configuration_utils import PretrainedConfig
 from transformers.cache_utils import Cache
@@ -589,8 +590,10 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         cache_position: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if os.name == 'nt' or get_compute_capability()<8:
-            print("for Windows or GPU before ampere, use forward_windows")
+        #if os.name == 'nt' or get_compute_capability()<8:
+        #    print("for Windows or GPU before ampere, use forward_windows")
+        if os.name == 'nt' or get_compute_capability()<8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name()):
+            print("for Windows or GPU before ampere or Z100/Z100L, use forward_windows")
            return self.forward_windows(
                hidden_states,
                attention_mask,
...
@@ -56,7 +56,8 @@ from ktransformers.models.modeling_deepseek import (
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.utils import InferenceState, get_compute_capability
+#from ktransformers.util.utils import InferenceState, get_compute_capability
+from ktransformers.util.utils import InferenceState, get_compute_capability, get_device_name
 from ktransformers.util.custom_gguf import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
@@ -649,8 +650,10 @@ class KDeepseekV2Model(BaseInjectedModule):
         if per_layer_prefill_flag:
             causal_mask = None
         else:
-            if os.name == 'nt' or get_compute_capability()<8:
-                print("for Windows or GPU before ampere, use forward_windows")
+            #if os.name == 'nt' or get_compute_capability()<8:
+            #    print("for Windows or GPU before ampere, use forward_windows")
+            if os.name == 'nt' or get_compute_capability()<8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name()):
+                print("for Windows or GPU before ampere or Z100/Z100L, use forward_windows")
                # only use mask in forward windows or can't flash attn
                causal_mask = self._update_causal_mask(
                    attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
...
@@ -33,6 +33,18 @@ def get_compute_capability(device:torch.device = None):
     else:
         return torch.cuda.get_device_properties(device)
 
+def get_device_name(device:torch.device = None):
+    if torch.cuda.is_available():
+        if device is None:
+            num_gpus = torch.cuda.device_count()
+            gpu_name = []
+            for gpu_id in range(num_gpus):
+                gpu_name.append(torch.cuda.get_device_name(gpu_id))
+            return gpu_name
+        else:
+            return torch.cuda.get_device_name(device)
+
 def set_module(model, submodule_key, module):
     tokens = submodule_key.split('.')
     sub_tokens = tokens[:-1]
...
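
A brief usage sketch of the new helper: with no argument it returns one name per visible device, with an explicit device it returns a single string (the Z100L names below are illustrative; the exact strings reported for Hygon devices may differ):

    from ktransformers.util.utils import get_device_name

    names = get_device_name()     # e.g. ["Z100L", "Z100L"] on a two-card machine
    first = get_device_name(0)    # e.g. "Z100L"

    # The checks added in this commit test membership in that list:
    print("Z100" in names or "Z100L" in names)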
@@ -377,8 +377,49 @@ elif MUSA_HOME is not None:
 else:
     raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
 
+ROCM_PATH = os.getenv('ROCM_PATH')
+dtk_path = ROCM_PATH + '/.info/rocm_version'
+with open(dtk_path, 'r') as file:
+    content = file.read().strip()
+    dtk_version = content.replace('.', '')
+    print(dtk_version)
+
+cwd = os.path.dirname(os.path.abspath(__file__))
+ver_path = os.path.join(cwd, "ktransformers", "__init__.py")
+with open(ver_path, "r", encoding="utf-8") as file:
+    for line in file:
+        match = re.search(r'^__version__\s*=\s*["\'](.*?)["\']', line)
+        if match:
+            k_version = match.group(1)
+            break
+    else:
+        raise RuntimeError("__version__ not found")
+
+with open(ver_path, 'r') as f:
+    lines = f.readlines()
+
+# Check whether __hcu_version__ already exists
+found = False
+new_lines = []
+for line in lines:
+    if line.startswith("__hcu_version__"):
+        # Replace the existing __hcu_version__
+        version = k_version + '+das.dtk' + dtk_version
+        new_lines.append(f"__hcu_version__ = '{version}'\n")
+        found = True
+    else:
+        new_lines.append(line)
+
+# If __hcu_version__ was not found, append it to the end of the file
+if not found:
+    version = k_version + '+das.dtk' + dtk_version
+    new_lines.append(f"__hcu_version__ = '{version}'\n")
+
+# Write the updated lines back to the file
+with open(ver_path, 'w') as f:
+    f.writelines(new_lines)
+
 setup(
-    version=VersionInfo().get_package_version(),
+    version=k_version + '+das.dtk' + dtk_version,
     cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
     ext_modules=[
         CMakeExtension("cpuinfer_ext"),
...
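
A worked example of the local version tag this setup logic produces, assuming ROCM_PATH/.info/rocm_version contains "25.04" and __version__ is "0.2.3post1" (the value this commit writes into ktransformers/__init__.py):

    content = "25.04"                        # assumed rocm_version file content
    dtk_version = content.replace('.', '')   # -> "2504"
    k_version = "0.2.3post1"
    print(k_version + '+das.dtk' + dtk_version)  # -> 0.2.3post1+das.dtk2504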