Commit d61f4dc8 authored by chenzk

v2.0

parent 2aab56dd
@@ -28,8 +28,8 @@ pip install -r finetune/requirements.txt # finetune/requirements.txt
pip install deepspeed-0.12.3+git299681e.abi0.dtk2310.torch2.1.0a0-cp38-cp38-linux_x86_64.whl
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl
# xformers
-tar -xvf xformers.tar
-cd xformers
+tar -xvf xformers-0.0.23.tar
+cd xformers-0.0.23
pip install xformers==0.0.23 --no-deps
bash patch_xformers.rocm.sh
```
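After the installs, a quick sanity check can confirm the wheels are visible from the active Python 3.8 environment (a minimal sketch; the version attributes below are standard for these packages):
```
# optional sanity check for the environment set up above
import torch
import deepspeed
import flash_attn
import xformers

# the ROCm/DTK torch build still exposes devices through the CUDA API
print("torch:", torch.__version__, "| device available:", torch.cuda.is_available())
print("deepspeed:", deepspeed.__version__)
print("flash_attn:", flash_attn.__version__)
print("xformers:", xformers.__version__)  # expected: 0.0.23
```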
@@ -43,8 +43,8 @@ docker run --shm-size=32G --name minicpm -v /opt/hyhal:/opt/hyhal --privileged=t
pip install deepspeed-0.12.3+git299681e.abi0.dtk2310.torch2.1.0a0-cp38-cp38-linux_x86_64.whl
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl
# xformers
-tar -xvf xformers.tar
-cd xformers
+tar -xvf xformers-0.0.23.tar
+cd xformers-0.0.23
pip install xformers==0.0.23 --no-deps
bash patch_xformers.rocm.sh
```
@@ -68,8 +68,8 @@ xformers:0.0.23
pip install deepspeed-0.12.3+git299681e.abi0.dtk2310.torch2.1.0a0-cp38-cp38-linux_x86_64.whl
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl
# xformers
-tar -xvf xformers.tar
-cd xformers
+tar -xvf xformers-0.0.23.tar
+cd xformers-0.0.23
pip install xformers==0.0.23 --no-deps
bash patch_xformers.rocm.sh
```
@@ -117,11 +117,45 @@ bash finetune/sft_finetune.sh # full-parameter finetune; GPU memory usage: 30245 MiB
## Inference
Method 1: PyTorch inference
```
python infer.py
# to run inference with the official default weights, set path = 'checkpoint/miniCPM-bf16' in the script
```
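If the finetuned checkpoint directory contains only a LoRA adapter rather than merged weights, it may need to be folded into the base model before plain AutoModelForCausalLM inference. A minimal sketch, assuming a peft-style adapter directory (paths are illustrative):
```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_path = 'checkpoint/miniCPM-bf16'
adapter_path = 'output/AdvertiseGenLoRA_lora_finetune/xxx/checkpoint-3000'  # xxx: timestamp-named directory (illustrative)

tokenizer = AutoTokenizer.from_pretrained(base_path)
base = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.bfloat16,
                                            device_map='cuda', trust_remote_code=True)
# attach the LoRA adapter, then merge it back into the base weights
model = PeftModel.from_pretrained(base, adapter_path)
model = model.merge_and_unload()
```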
Method 2: vLLM inference (faster)
1. vLLM has specific environment requirements; it only works with the library versions below (the required wheels are in the whl folder):
```
pip install vllm-0.2.2+git40eaf6d.abi0.dtk2310.torch2.1-cp38-cp38-linux_x86_64.whl
pip install transformers==4.35.2
# install xformers-0.0.22
tar -xvf xformers-0.0.22.tar
cd xformers
pip install xformers==0.0.22 --no-deps
bash patch_xformers-0.0.22.post7.rocm.sh
```
2. Model inference
```
cd MiniCPM
python inference/convert_hf_to_vllmcpm.py --load checkpoint/miniCPM-bf16 --save vllmcpm_repo_path
mv vllmcpm_repo_path inference/vllm/examples/infer_cpm/
python inference.py --model_path vllmcpm_repo_path --prompt_path prompts/prompt_demo.txt # AWQ quantization is currently not supported
```
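For reference, the provided inference.py is a thin wrapper over vLLM's offline API (its core appears in the inference.py hunk further down). A minimal sketch, with an illustrative prompt and the converted repo path from the step above:
```
# minimal offline generation with the converted vllmcpm repo
from vllm import LLM, SamplingParams

prompts = ["山东省最高的山是哪座山?"]
sampling_params = SamplingParams(temperature=0.5, top_p=0.8)

llm = LLM(model="vllmcpm_repo_path", tensor_parallel_size=1, dtype='float16')
for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```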
Method 3: fastllm inference
1. Environment setup:
```
pip install transformers==4.37.2
# build and install fastllm from source: https://developer.hpccube.com/codes/OpenDAS/fastllm
```
2. Model inference
```
python infer_fastllm.py
```
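infer_fastllm.py (shown further down) converts the HF model with llm.from_hf on every run. If the fastllm_pytools API in the linked fork matches upstream fastllm, the converted model can also be exported to a .flm file once and reloaded directly; a hedged sketch (model.save and llm.model are assumed from upstream fastllm):
```
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from fastllm_pytools import llm

path = 'checkpoint/miniCPM-bf16'
tokenizer = AutoTokenizer.from_pretrained(path)
hf_model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16,
                                                device_map='cuda', trust_remote_code=True)

# convert once and export to a standalone .flm file (save API assumed from upstream fastllm)
model = llm.from_hf(hf_model, tokenizer, dtype="float16")
model.save("minicpm-fp16.flm")

# later runs can load the exported file directly, skipping transformers
model = llm.model("minicpm-fp16.flm")
print(model.response("<用户>山东省最高的山是哪座山?<AI>", top_p=0.8, temperature=0.5))
```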
## Results
```
# Question
......
@@ -5,5 +5,6 @@ rouge_chinese>=1.0.3
jupyter>=1.0.0
datasets>=2.16.1
peft>=0.7.1
+transformers==4.37.2
# deepspeed>=0.13.1
# flash_attn>=2.5.1
docker run -it --shm-size=32G -v $PWD/MiniCPM:/home/MiniCPM -v /opt/hyhal:/opt/hyhal --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name minicpm ffa1f63239fc bash
# python -m torch.utils.collect_env
@@ -5,5 +5,6 @@ rouge_chinese>=1.0.3
jupyter>=1.0.0
datasets>=2.16.1
peft>=0.7.1
+transformers==4.37.2
# deepspeed>=0.13.1
# flash_attn>=2.5.1
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
torch.manual_seed(0)
path = "output/AdvertiseGenLoRA_lora_finetune/xxx/checkpoint-3000" # xxx:系统时间路径
# path = 'checkpoint/miniCPM-bf16'
#path = "output/AdvertiseGenLoRA_lora_finetune/xxx/checkpoint-3000" # xxx:系统时间路径
path = 'checkpoint/miniCPM-bf16'
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True)
start_time = time.time()
responds, history = model.chat(tokenizer, "山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?", temperature=0.5, top_p=0.8, repetition_penalty=1.02)
print("infer time:", time.time() - start_time, "s")
print(responds)
import torch
from transformers import AutoTokenizer, LlamaTokenizerFast, AutoModelForCausalLM
path = 'checkpoint/miniCPM-bf16'
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='cuda', trust_remote_code=True)
from fastllm_pytools import llm
llm.set_device_map("cpu")
model = llm.from_hf(model, tokenizer, dtype = "float16") # supported dtype values: "float16", "int8", "int4"
print(model.response("<用户>山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?<AI>", top_p=0.8, temperature=0.5, repeat_penalty=1.02))
import argparse
import json
import os
import shutil
from tqdm import tqdm
from collections import OrderedDict
import torch
def convert_model(config, ckpt):
    # config: map the HF MiniCPM config to the CPM-Dragonfly (vllmcpm) layout
    config_bmt = OrderedDict(
        {
            "_dtype": "bf16",
            "activate_fn": "silu",
            "architectures": [
                "CPMDragonflyForCausalLM"
            ],
            "model_type": "cpm_dragonfly",
            "base": 10000,
            "dim_ff": config['intermediate_size'],
            "dim_head": config['hidden_size'] // config['num_attention_heads'],
            "dim_model": config['hidden_size'],
            "dim_model_base": 256,
            "dropout_p": 0.0,
            "eps": config['rms_norm_eps'],
            "init_std": config['initializer_range'],
            "num_heads": config['num_attention_heads'],
            "num_kv_heads": config['num_key_value_heads'],
            "num_layers": config['num_hidden_layers'],
            "orig_max_length": 4096,
            "pose_prob": 0.0,
            "pose_scaling_factor": 1.0,
            "qk_norm": False,
            "rope_scaling_factor": 1,
            "rope_scaling_type": "",
            "scale": True,
            "scale_depth": config['scale_depth'],
            "scale_emb": config['scale_emb'],
            "tie_lm_head": True,
            "tp": 0,
            "transformers_version": "4.35.0",
            "vocab_size": config['vocab_size']
        }
    )
    # weights: rename the HF LLaMA-style tensors to the CPM-Dragonfly naming scheme
    model_bmt = OrderedDict()
    model_bmt["input_embedding.weight"] = ckpt['model.embed_tokens.weight'].contiguous()
    model_bmt["encoder.output_layernorm.weight"] = ckpt['model.norm.weight'].contiguous()
    for lnum in tqdm(range(config_bmt['num_layers'])):
        hf_pfx = f"model.layers.{lnum}"
        bmt_pfx = f"encoder.layers.{lnum}"
        model_bmt[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = ckpt[f"{hf_pfx}.input_layernorm.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = ckpt[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = ckpt[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = ckpt[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = ckpt[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = ckpt[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = ckpt[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = ckpt[f"{hf_pfx}.mlp.up_proj.weight"].contiguous()
        model_bmt[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = ckpt[f"{hf_pfx}.mlp.down_proj.weight"].contiguous()
    return config_bmt, model_bmt

def load_model_ckpt(args):
    with open(os.path.join(args.load, "config.json"), 'r') as fin:
        config = json.load(fin)
    ckpt = torch.load(os.path.join(args.load, "pytorch_model.bin"))
    os.makedirs(f"{args.save}", exist_ok=True)
    # model and config
    bmt_config, bmt_ckpt = convert_model(config, ckpt)
    with open(os.path.join(args.save, "config.json"), 'w') as fout:
        json.dump(bmt_config, fout, indent=4)
    torch.save(bmt_ckpt, f"{args.save}/pytorch_model.pt")
    # tokenizer: copy the tokenizer files over unchanged
    shutil.copyfile(f"{args.load}/tokenizer.json", f"{args.save}/tokenizer.json")
    shutil.copyfile(f"{args.load}/tokenizer.model", f"{args.save}/tokenizer.model")
    shutil.copyfile(f"{args.load}/special_tokens_map.json", f"{args.save}/special_tokens_map.json")
    shutil.copyfile(f"{args.load}/tokenizer_config.json", f"{args.save}/tokenizer_config.json")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--load", type=str, default="")
    parser.add_argument("--save", type=str, default="")
    args = parser.parse_args()
    load_model_ckpt(args)
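A quick way to check the converted output of convert_hf_to_vllmcpm.py before moving it under inference/vllm/examples/infer_cpm/ (a minimal sketch; vllmcpm_repo_path stands for whatever was passed to --save):
```
# inspect the converted checkpoint produced by convert_hf_to_vllmcpm.py
import json
import torch

save_dir = "vllmcpm_repo_path"  # the --save directory
with open(f"{save_dir}/config.json") as f:
    cfg = json.load(f)
print(cfg["model_type"], cfg["num_layers"], cfg["dim_model"])

state = torch.load(f"{save_dir}/pytorch_model.pt", map_location="cpu")
print(len(state), "tensors")
print("input_embedding.weight" in state)  # expected: True
print("encoder.layers.0.self_att.self_attention.project_q.weight" in state)  # expected: True
```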
@@ -40,7 +40,8 @@ params_dict = {
sampling_params = SamplingParams(**params_dict)
# Create an LLM.
-llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='bfloat16')
+# llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='bfloat16')
+llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='float16')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
for prompt in prompts:
......