Commit 0d7d2663 authored by zhanggzh's avatar zhanggzh
Browse files

Change setup version code and add support for Z100/Z100L

parent 8442a745
File added
#!/bin/bash
# Clean-build helper for ktransformers.
#
# Steps: remove previous build artifacts, install the Python dependencies,
# initialise git submodules, install ktransformers from the current
# directory, then build a wheel into ./dist.
# All paths are relative to the current working directory.
set -e  # abort on the first failing command

# Remove build directories and stale distribution files.
rm -rf build
rm -rf dist
rm -rf *.egg-info
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info

# The message names the file that is actually installed.
echo "Installing python dependencies from requirements-local_chat.txt"
pip install -r requirements-local_chat.txt

echo "初始化Git子模块..."
git submodule update --init --recursive

# Build settings exported to the pip/CMake build below.
# NOTE(review): USE_FASTPT_CUDA is presumably read by the build backend - confirm.
export USE_FASTPT_CUDA=True
# Number of parallel jobs CMake may use when building.
export CMAKE_BUILD_PARALLEL_LEVEL=32

echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
echo "Installation completed successfully"

# Build the ktransformers wheel (without bundling dependencies) into ./dist.
echo "构建ktransformers wheel包"
mkdir -p dist
KTRANSFORMERS_FORCE_BUILD=TRUE pip wheel . -w dist --no-build-isolation --no-deps

echo "生成的wheel包位于:"
ls -l dist/*.whl
echo "构建成功!wheel包已生成在dist目录"
#!/usr/bin/env python
# coding=utf-8
'''
Description : Version identifiers exposed by this module.
Author : kkk1nak0
Date : 2024-08-15 07:34:46
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2025-02-15 03:53:02
'''
# Base version string. Only one assignment is kept so that any tool that
# scans this file for the first ``__version__`` line sees the effective value.
__version__ = "0.2.3post1"
# Base version plus a "+das.dtk<N>" local build suffix.
__hcu_version__ = '0.2.3post1+das.dtk2504'
......@@ -28,7 +28,8 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate, get_compute_capability
#from ktransformers.util.utils import prefill_and_generate, get_compute_capability
from ktransformers.util.utils import prefill_and_generate, get_compute_capability, get_device_name
from ktransformers.server.config.config import Config
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
......@@ -169,7 +170,8 @@ def local_chat(
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml"
if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
#if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM") and flashinfer_enabled and (get_compute_capability() >= 8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name())):
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim
......
......@@ -16,7 +16,8 @@ from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_ro
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import get_compute_capability
#from ktransformers.util.utils import get_compute_capability
from ktransformers.util.utils import get_compute_capability, get_device_name
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.cache_utils import Cache
......@@ -589,8 +590,10 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if os.name == 'nt' or get_compute_capability()<8:
print("for Windows or GPU before ampere, use forward_windows")
#if os.name == 'nt' or get_compute_capability()<8:
#print("for Windows or GPU before ampere, use forward_windows")
if os.name == 'nt' or get_compute_capability()<8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name()):
print("for Windows or GPU before ampere or Z100/Z100L, use forward_windows")
return self.forward_windows(
hidden_states,
attention_mask,
......
......@@ -56,7 +56,8 @@ from ktransformers.models.modeling_deepseek import (
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState, get_compute_capability
#from ktransformers.util.utils import InferenceState, get_compute_capability
from ktransformers.util.utils import InferenceState, get_compute_capability, get_device_name
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
......@@ -649,8 +650,10 @@ class KDeepseekV2Model(BaseInjectedModule):
if per_layer_prefill_flag:
causal_mask = None
else:
if os.name == 'nt' or get_compute_capability()<8:
print("for Windows or GPU before ampere, use forward_windows")
#if os.name == 'nt' or get_compute_capability()<8:
# print("for Windows or GPU before ampere, use forward_windows")
if os.name == 'nt' or get_compute_capability()<8 or ("Z100" in get_device_name()) or ("Z100L" in get_device_name()):
print("for Windows or GPU before ampere or Z100/Z100L, use forward_windows")
# only use mask in forward windows or can't flash attn
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
......
......@@ -33,6 +33,18 @@ def get_compute_capability(device:torch.device = None):
else:
return torch.cuda.get_device_properties(device)
def get_device_name(device:torch.device = None):
    """Return the name(s) of the CUDA device(s) as reported by torch.

    Args:
        device: Device to query. When ``None``, every visible device is
            queried.

    Returns:
        When ``device`` is ``None``: a list with one name per visible device,
        ordered by device index (an empty list when CUDA is unavailable).
        Otherwise: the name of ``device`` as a string (an empty string when
        CUDA is unavailable).

    Note:
        Because the no-argument form returns a list, an expression such as
        ``"Z100" in get_device_name()`` is an exact-name membership test,
        not a substring search.
    """
    if not torch.cuda.is_available():
        # Return an empty container of the matching type instead of an
        # implicit None, so callers' ``in`` checks evaluate to False rather
        # than raising TypeError on hosts without CUDA.
        return [] if device is None else ""
    if device is None:
        return [
            torch.cuda.get_device_name(gpu_id)
            for gpu_id in range(torch.cuda.device_count())
        ]
    return torch.cuda.get_device_name(device)
def set_module(model, submodule_key, module):
tokens = submodule_key.split('.')
sub_tokens = tokens[:-1]
......
......@@ -377,8 +377,49 @@ elif MUSA_HOME is not None:
else:
raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
def _stamp_hcu_version(lines, version):
    """Return a copy of *lines* with ``__hcu_version__`` set to *version*.

    Every line that starts with ``__hcu_version__`` is replaced by a fresh
    assignment. If no such line exists, the assignment is appended, after
    making sure the previous last line is newline-terminated so the two
    statements cannot end up glued together on one line.
    """
    stamped_line = f"__hcu_version__ = '{version}'\n"
    if any(line.startswith("__hcu_version__") for line in lines):
        return [
            stamped_line if line.startswith("__hcu_version__") else line
            for line in lines
        ]
    stamped = list(lines)
    if stamped and not stamped[-1].endswith("\n"):
        stamped[-1] += "\n"
    stamped.append(stamped_line)
    return stamped


# --- Derive the DTK version from the ROCm installation --------------------
ROCM_PATH = os.getenv('ROCM_PATH')
if not ROCM_PATH:
    # Fail with an actionable message instead of a TypeError on None + str.
    raise RuntimeError(
        "ROCM_PATH is not set; it is required to read the DTK version from "
        "<ROCM_PATH>/.info/rocm_version"
    )
dtk_path = ROCM_PATH + '/.info/rocm_version'
with open(dtk_path, 'r') as file:
    content = file.read().strip()
# Drop the dots from the version file's content (e.g. "25.04" -> "2504").
dtk_version = content.replace('.', '')
print(dtk_version)

# --- Read the base package version from ktransformers/__init__.py ---------
cwd = os.path.dirname(os.path.abspath(__file__))
ver_path = os.path.join(cwd, "ktransformers", "__init__.py")
# Read the file once, always as UTF-8, and reuse the lines below.
with open(ver_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

for line in lines:
    match = re.search(r'^__version__\s*=\s*["\'](.*?)["\']', line)
    if match:
        k_version = match.group(1)
        break
else:
    # No line carried a __version__ assignment.
    raise RuntimeError("未找到 __version__ 信息")

# --- Stamp __hcu_version__ back into the same file -------------------------
version = k_version + '+das.dtk' + dtk_version
new_lines = _stamp_hcu_version(lines, version)
# Only touch the source tree when the stamped content actually differs.
if new_lines != lines:
    with open(ver_path, 'w', encoding="utf-8") as f:
        f.writelines(new_lines)
setup(
version=VersionInfo().get_package_version(),
version=k_version + '+das.dtk' + dtk_version,
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
ext_modules=[
CMakeExtension("cpuinfer_ext"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment