Commit d61f4dc8 authored by chenzk's avatar chenzk
Browse files

v2.0

parent 2aab56dd
......@@ -28,8 +28,8 @@ pip install -r finetune/requirements.txt # finetune/requirements.txt
pip install deepspeed-0.12.3+git299681e.abi0.dtk2310.torch2.1.0a0-cp38-cp38-linux_x86_64.whl
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl
# xformers
tar -xvf xformers.tar
cd xformers
tar -xvf xformers-0.0.23.tar
cd xformers-0.0.23
pip install xformers==0.0.23 --no-deps
bash patch_xformers.rocm.sh
```
......@@ -43,8 +43,8 @@ docker run --shm-size=32G --name minicpm -v /opt/hyhal:/opt/hyhal --privileged=t
pip install deepspeed-0.12.3+git299681e.abi0.dtk2310.torch2.1.0a0-cp38-cp38-linux_x86_64.whl
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl
# xformers
tar -xvf xformers.tar
cd xformers
tar -xvf xformers-0.0.23.tar
cd xformers-0.0.23
pip install xformers==0.0.23 --no-deps
bash patch_xformers.rocm.sh
```
......@@ -68,8 +68,8 @@ xformers:0.0.23
pip install deepspeed-0.12.3+git299681e.abi0.dtk2310.torch2.1.0a0-cp38-cp38-linux_x86_64.whl
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl
# xformers
tar -xvf xformers.tar
cd xformers
tar -xvf xformers-0.0.23.tar
cd xformers-0.0.23
pip install xformers==0.0.23 --no-deps
bash patch_xformers.rocm.sh
```
......@@ -117,11 +117,45 @@ bash finetune/sft_finetune.sh # 全参数finetune,显存占用30245MiB。
## 推理
方法一:pytorch推理
```
python infer.py
# 若采用官方默认权重推理:代码里设置path = 'checkpoint/miniCPM-bf16'
```
方法二:vllm推理(更快)
1、vllm对环境有特殊要求,需要安装以下版本的库才可用,所需版本位于whl文件夹:
```
pip install vllm-0.2.2+git40eaf6d.abi0.dtk2310.torch2.1-cp38-cp38-linux_x86_64.whl
pip install transformers==4.35.2
# 安装xformers-0.0.22
tar -xvf xformers-0.0.22.tar
cd xformers
pip install xformers==0.0.22 --no-deps
bash patch_xformers-0.0.22.post7.rocm.sh
```
2、模型推理
```
cd MiniCPM
python inference/convert_hf_to_vllmcpm.py --load checkpoint/miniCPM-bf16 --save vllmcpm_repo_path
mv vllmcpm_repo_path inference/vllm/examples/infer_cpm/
python inference.py --model_path vllmcpm_repo_path --prompt_path prompts/prompt_demo.txt #目前不支持awq量化
```
方法三:fastllm推理
1、特殊环境配置方法:
```
pip install transformers==4.37.2
编译安装fastllm:https://developer.hpccube.com/codes/OpenDAS/fastllm
```
2、模型推理
```
python infer_fastllm.py
```
## result
```
#问题
......
......@@ -5,5 +5,6 @@ rouge_chinese>=1.0.3
jupyter>=1.0.0
datasets>=2.16.1
peft>=0.7.1
transformers==4.37.2
# deepspeed>=0.13.1
# flash_attn>=2.5.1
docker run -it --shm-size=32G -v $PWD/MiniCPM:/home/MiniCPM -v /opt/hyhal:/opt/hyhal --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name minicpm ffa1f63239fc bash
# python -m torch.utils.collect_env
......@@ -5,5 +5,6 @@ rouge_chinese>=1.0.3
jupyter>=1.0.0
datasets>=2.16.1
peft>=0.7.1
transformers==4.37.2
# deepspeed>=0.13.1
# flash_attn>=2.5.1
# PyTorch inference script for MiniCPM.
# Fix: diff residue left `path` assigned twice in a row (the first assignment
# was a dead store, immediately overwritten); keep a single active assignment
# plus one commented-out alternative.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time

# Fixed seed for reproducible sampling.
torch.manual_seed(0)

# Checkpoint to load: a LoRA finetune output, or the official bf16 weights.
# path = "output/AdvertiseGenLoRA_lora_finetune/xxx/checkpoint-3000"  # xxx: timestamped run directory
path = 'checkpoint/miniCPM-bf16'

tokenizer = AutoTokenizer.from_pretrained(path)
# trust_remote_code is required: MiniCPM ships custom modeling code with the checkpoint.
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map='cuda', trust_remote_code=True)

start_time = time.time()
responds, history = model.chat(tokenizer, "山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?", temperature=0.5, top_p=0.8, repetition_penalty=1.02)
print("infer time:", time.time() - start_time, "s")
print(responds)
# fastllm inference script for MiniCPM: load the HF checkpoint first, then
# convert the in-memory model to a fastllm model for CPU inference.
import torch
from transformers import AutoTokenizer, LlamaTokenizerFast, AutoModelForCausalLM
# Official MiniCPM bf16 checkpoint directory.
path = 'checkpoint/miniCPM-bf16'
tokenizer = AutoTokenizer.from_pretrained(path)
# float16 here: the fastllm conversion below is done from fp16 weights.
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='cuda', trust_remote_code=True)
from fastllm_pytools import llm
# Run the converted model on CPU.
llm.set_device_map("cpu")
model = llm.from_hf(model, tokenizer, dtype = "float16") # dtype supports "float16", "int8", "int4"
# Prompt uses MiniCPM's chat template markers <用户>/<AI> directly.
print(model.response("<用户>山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?<AI>", top_p=0.8, temperature=0.5, repeat_penalty=1.02))
import argparse
import json
import os
import shutil
from tqdm import tqdm
from collections import OrderedDict
import torch
def convert_model(config, ckpt):
    """Convert a HuggingFace MiniCPM checkpoint to the BMT (cpm_dragonfly) layout.

    Args:
        config: parsed HF ``config.json`` as a dict.
        ckpt: HF state dict mapping parameter names to tensors.

    Returns:
        Tuple ``(config_bmt, model_bmt)`` — the converted config and the
        converted state dict, both as ``OrderedDict``.
    """
    n_heads = config['num_attention_heads']
    # Target config: fixed cpm_dragonfly fields plus values carried over from
    # the HF config (dimensions, norm eps, init std, vocab size, scales).
    config_bmt = OrderedDict(
        {
            "_dtype": "bf16",
            "activate_fn": "silu",
            "architectures": [
                "CPMDragonflyForCausalLM"
            ],
            "model_type": "cpm_dragonfly",
            "base": 10000,
            "dim_ff": config['intermediate_size'],
            "dim_head": config['hidden_size'] // n_heads,
            "dim_model": config['hidden_size'],
            "dim_model_base": 256,
            "dropout_p": 0.0,
            "eps": config['rms_norm_eps'],
            "init_std": config['initializer_range'],
            "num_heads": n_heads,
            "num_kv_heads": config['num_key_value_heads'],
            "num_layers": config['num_hidden_layers'],
            "orig_max_length": 4096,
            "pose_prob": 0.0,
            "pose_scaling_factor": 1.0,
            "qk_norm": False,
            "rope_scaling_factor": 1,
            "rope_scaling_type": "",
            "scale": True,
            "scale_depth": config['scale_depth'],
            "scale_emb": config['scale_emb'],
            "tie_lm_head": True,
            "tp": 0,
            "transformers_version": "4.35.0",
            "vocab_size": config['vocab_size']
        }
    )

    model_bmt = OrderedDict()
    # Embedding and final norm map directly.
    model_bmt["input_embedding.weight"] = ckpt['model.embed_tokens.weight'].contiguous()
    model_bmt["encoder.output_layernorm.weight"] = ckpt['model.norm.weight'].contiguous()

    # Per-layer parameter renames: (HF suffix, BMT suffix), in output order.
    layer_map = (
        ("input_layernorm.weight", "self_att.layernorm_before_attention.weight"),
        ("self_attn.q_proj.weight", "self_att.self_attention.project_q.weight"),
        ("self_attn.k_proj.weight", "self_att.self_attention.project_k.weight"),
        ("self_attn.v_proj.weight", "self_att.self_attention.project_v.weight"),
        ("self_attn.o_proj.weight", "self_att.self_attention.attention_out.weight"),
        ("post_attention_layernorm.weight", "ffn.layernorm_before_ffn.weight"),
        ("mlp.gate_proj.weight", "ffn.ffn.w_in.w_0.weight"),
        ("mlp.up_proj.weight", "ffn.ffn.w_in.w_1.weight"),
        ("mlp.down_proj.weight", "ffn.ffn.w_out.weight"),
    )
    for lnum in tqdm(range(config_bmt['num_layers'])):
        for hf_suffix, bmt_suffix in layer_map:
            model_bmt[f"encoder.layers.{lnum}.{bmt_suffix}"] = \
                ckpt[f"model.layers.{lnum}.{hf_suffix}"].contiguous()
    return config_bmt, model_bmt
def load_model_ckpt(args):
    """Convert an HF checkpoint directory into the vllm-cpm layout.

    Reads ``config.json`` and ``pytorch_model.bin`` from ``args.load``,
    converts them via :func:`convert_model`, and writes the converted
    config, weights, and copied tokenizer files into ``args.save``.
    """
    with open(os.path.join(args.load, "config.json"), 'r') as cfg_in:
        hf_config_dict = json.load(cfg_in)
    state_dict = torch.load(os.path.join(args.load, "pytorch_model.bin"))

    os.makedirs(f"{args.save}", exist_ok=True)

    # Converted model weights and config.
    bmt_config, bmt_state = convert_model(hf_config_dict, state_dict)
    with open(os.path.join(args.save, "config.json"), 'w') as cfg_out:
        json.dump(bmt_config, cfg_out, indent=4)
    torch.save(bmt_state, f"{args.save}/pytorch_model.pt")

    # Tokenizer artifacts are copied through unchanged.
    for fname in ("tokenizer.json", "tokenizer.model",
                  "special_tokens_map.json", "tokenizer_config.json"):
        shutil.copyfile(f"{args.load}/{fname}", f"{args.save}/{fname}")
if __name__ == "__main__":
    # CLI: --load <hf_checkpoint_dir> --save <output_dir>
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--load", type=str, default="")
    arg_parser.add_argument("--save", type=str, default="")
    load_model_ckpt(arg_parser.parse_args())
......@@ -40,7 +40,8 @@ params_dict = {
sampling_params = SamplingParams(**params_dict)
# Create an LLM.
# Fix: diff residue constructed the LLM twice (bfloat16 then float16),
# loading the model weights twice; keep only the float16 construction.
# llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='bfloat16')
llm = LLM(model=args.model_path, tensor_parallel_size=1, dtype='float16')
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
for prompt in prompts:
......
s': 4.996971607208252, 'eval_runtime': 1009.8753, 'eval_samples_per_second': 1.06, 'eval_steps_per_second': 1.06, 'epoch': 0.01}
50%|████████████████████████████████████████████████████████████████████████████████████████████████ | 1500/3000 [1:34:10<45:25, 1.82s/itSaving model checkpoint to output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500
/usr/local/lib/python3.8/site-packages/peft/utils/save_and_load.py:148: UserWarning: Could not find a config file in checkpoint/miniCPM-bf16/ - will assume that the vocabulary was not modified.
warnings.warn(
tokenizer config file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/tokenizer_config.json
Special tokens file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/special_tokens_map.json
[2024-02-20 16:35:20,179] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1500 is about to be saved!
/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
warnings.warn(
[2024-02-20 16:35:20,222] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/global_step1500/zero_pp_rank_0_mp_rank_00_model_states.pt
[2024-02-20 16:35:20,222] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/global_step1500/zero_pp_rank_0_mp_rank_00_model_states.pt...
[2024-02-20 16:35:20,260] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/global_step1500/zero_pp_rank_0_mp_rank_00_model_states.pt.
[2024-02-20 16:35:20,261] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2024-02-20 16:35:20,378] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2024-02-20 16:35:20,386] [INFO] [engine.py:3417:_save_zero_checkpoint] zero checkpoint saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-1500/global_step1500/zero_pp_rank_0_mp_rank_00_optim_states.pt
[2024-02-20 16:35:20,397] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1500 is ready now!
{'loss': 5.3635, 'learning_rate': 0.0005144827586206897, 'epoch': 0.01}
{'loss': 5.1967, 'learning_rate': 0.0005110344827586208, 'epoch': 0.01}
{'loss': 5.0065, 'learning_rate': 0.0005075862068965517, 'epoch': 0.01}
{'loss': 4.8759, 'learning_rate': 0.0005041379310344828, 'epoch': 0.01}
{'loss': 5.172, 'learning_rate': 0.0005006896551724138, 'epoch': 0.01}
{'loss': 5.0331, 'learning_rate': 0.0004972413793103449, 'epoch': 0.01}
{'loss': 4.9884, 'learning_rate': 0.0004937931034482758, 'epoch': 0.01}
{'loss': 4.9411, 'learning_rate': 0.0004903448275862069, 'epoch': 0.01}
{'loss': 4.9256, 'learning_rate': 0.00048689655172413796, 'epoch': 0.01}
{'loss': 4.9216, 'learning_rate': 0.00048344827586206896, 'epoch': 0.01}
{'loss': 5.1224, 'learning_rate': 0.00048, 'epoch': 0.01}
{'loss': 4.6957, 'learning_rate': 0.000476551724137931, 'epoch': 0.01}
{'loss': 5.1196, 'learning_rate': 0.0004731034482758621, 'epoch': 0.01}
{'loss': 4.7971, 'learning_rate': 0.0004696551724137931, 'epoch': 0.01}
{'loss': 4.9963, 'learning_rate': 0.0004662068965517242, 'epoch': 0.01}
{'loss': 4.8869, 'learning_rate': 0.0004627586206896552, 'epoch': 0.01}
{'loss': 5.0425, 'learning_rate': 0.00045931034482758624, 'epoch': 0.01}
{'loss': 4.9879, 'learning_rate': 0.00045586206896551724, 'epoch': 0.01}
{'loss': 4.7501, 'learning_rate': 0.0004524137931034483, 'epoch': 0.01}
{'loss': 5.0549, 'learning_rate': 0.0004489655172413793, 'epoch': 0.01}
{'loss': 4.8451, 'learning_rate': 0.0004455172413793104, 'epoch': 0.01}
{'loss': 4.8229, 'learning_rate': 0.0004420689655172414, 'epoch': 0.02}
{'loss': 5.2047, 'learning_rate': 0.00043862068965517246, 'epoch': 0.02}
{'loss': 5.1593, 'learning_rate': 0.00043517241379310346, 'epoch': 0.02}
{'loss': 5.1894, 'learning_rate': 0.00043172413793103446, 'epoch': 0.02}
{'loss': 4.8937, 'learning_rate': 0.0004282758620689655, 'epoch': 0.02}
{'loss': 5.2911, 'learning_rate': 0.0004248275862068965, 'epoch': 0.02}
{'loss': 4.9354, 'learning_rate': 0.0004213793103448276, 'epoch': 0.02}
{'loss': 5.0007, 'learning_rate': 0.00041793103448275863, 'epoch': 0.02}
{'loss': 4.9943, 'learning_rate': 0.0004144827586206897, 'epoch': 0.02}
{'loss': 4.9288, 'learning_rate': 0.0004110344827586207, 'epoch': 0.02}
{'loss': 4.6495, 'learning_rate': 0.00040758620689655174, 'epoch': 0.02}
{'loss': 4.8075, 'learning_rate': 0.00040413793103448274, 'epoch': 0.02}
{'loss': 5.047, 'learning_rate': 0.0004006896551724138, 'epoch': 0.02}
{'loss': 4.7298, 'learning_rate': 0.0003972413793103448, 'epoch': 0.02}
{'loss': 5.0614, 'learning_rate': 0.00039379310344827585, 'epoch': 0.02}
{'loss': 4.8462, 'learning_rate': 0.0003903448275862069, 'epoch': 0.02}
{'loss': 4.9027, 'learning_rate': 0.00038689655172413796, 'epoch': 0.02}
{'loss': 4.8653, 'learning_rate': 0.00038344827586206897, 'epoch': 0.02}
{'loss': 5.0492, 'learning_rate': 0.00038, 'epoch': 0.02}
{'loss': 5.0055, 'learning_rate': 0.000376551724137931, 'epoch': 0.02}
{'loss': 4.9203, 'learning_rate': 0.0003731034482758621, 'epoch': 0.02}
{'loss': 5.0601, 'learning_rate': 0.0003696551724137931, 'epoch': 0.02}
{'loss': 4.8342, 'learning_rate': 0.00036620689655172413, 'epoch': 0.02}
{'loss': 4.9579, 'learning_rate': 0.0003627586206896552, 'epoch': 0.02}
{'loss': 4.8838, 'learning_rate': 0.00035931034482758624, 'epoch': 0.02}
{'loss': 5.1245, 'learning_rate': 0.00035586206896551725, 'epoch': 0.02}
{'loss': 4.7047, 'learning_rate': 0.0003524137931034483, 'epoch': 0.02}
{'loss': 4.8103, 'learning_rate': 0.0003489655172413793, 'epoch': 0.02}
{'loss': 5.0187, 'learning_rate': 0.00034551724137931036, 'epoch': 0.02}
67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 2000/3000 [1:51:07<34:29, 2.07s/it]***** Running Evaluation *****
Num examples = 1070
Batch size = 1
{'eval_loss': 4.907186985015869, 'eval_runtime': 950.0193, 'eval_samples_per_second': 1.126, 'eval_steps_per_second': 1.126, 'epoch': 0.02}
67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 2000/3000 [2:06:57<34:29, 2.07s/itSaving model checkpoint to output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000
/usr/local/lib/python3.8/site-packages/peft/utils/save_and_load.py:148: UserWarning: Could not find a config file in checkpoint/miniCPM-bf16/ - will assume that the vocabulary was not modified.
warnings.warn(
tokenizer config file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/tokenizer_config.json
Special tokens file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/special_tokens_map.json
[2024-02-20 17:08:07,683] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is about to be saved!
/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
warnings.warn(
[2024-02-20 17:08:07,726] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt
[2024-02-20 17:08:07,726] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt...
[2024-02-20 17:08:07,760] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt.
[2024-02-20 17:08:07,762] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2024-02-20 17:08:07,864] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2024-02-20 17:08:07,875] [INFO] [engine.py:3417:_save_zero_checkpoint] zero checkpoint saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt
[2024-02-20 17:08:07,886] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2000 is ready now!
{'loss': 5.0544, 'learning_rate': 0.00034206896551724136, 'epoch': 0.02}
{'loss': 4.8734, 'learning_rate': 0.0003386206896551724, 'epoch': 0.02}
{'loss': 4.8588, 'learning_rate': 0.00033517241379310347, 'epoch': 0.02}
{'loss': 4.8809, 'learning_rate': 0.0003317241379310345, 'epoch': 0.02}
{'loss': 5.3568, 'learning_rate': 0.0003282758620689655, 'epoch': 0.02}
{'loss': 4.7724, 'learning_rate': 0.0003248275862068966, 'epoch': 0.02}
{'loss': 4.9533, 'learning_rate': 0.0003213793103448276, 'epoch': 0.02}
{'loss': 4.6215, 'learning_rate': 0.00031793103448275864, 'epoch': 0.02}
{'loss': 5.2978, 'learning_rate': 0.00031448275862068964, 'epoch': 0.02}
{'loss': 4.7191, 'learning_rate': 0.0003110344827586207, 'epoch': 0.02}
{'loss': 4.8991, 'learning_rate': 0.00030758620689655175, 'epoch': 0.02}
{'loss': 5.2957, 'learning_rate': 0.0003041379310344828, 'epoch': 0.02}
{'loss': 5.186, 'learning_rate': 0.0003006896551724138, 'epoch': 0.02}
{'loss': 4.4386, 'learning_rate': 0.00029724137931034486, 'epoch': 0.02}
{'loss': 4.9961, 'learning_rate': 0.00029379310344827586, 'epoch': 0.02}
{'loss': 4.8623, 'learning_rate': 0.0002903448275862069, 'epoch': 0.02}
{'loss': 4.6938, 'learning_rate': 0.0002868965517241379, 'epoch': 0.02}
{'loss': 5.0361, 'learning_rate': 0.000283448275862069, 'epoch': 0.02}
{'loss': 4.8112, 'learning_rate': 0.00028000000000000003, 'epoch': 0.02}
{'loss': 5.0209, 'learning_rate': 0.0002765517241379311, 'epoch': 0.02}
{'loss': 5.0334, 'learning_rate': 0.0002731034482758621, 'epoch': 0.02}
{'loss': 4.893, 'learning_rate': 0.00026965517241379314, 'epoch': 0.02}
{'loss': 4.7499, 'learning_rate': 0.00026620689655172414, 'epoch': 0.02}
{'loss': 5.2517, 'learning_rate': 0.00026275862068965514, 'epoch': 0.02}
{'loss': 5.1651, 'learning_rate': 0.0002593103448275862, 'epoch': 0.02}
{'loss': 5.0377, 'learning_rate': 0.0002558620689655172, 'epoch': 0.02}
{'loss': 5.0951, 'learning_rate': 0.0002524137931034483, 'epoch': 0.02}
{'loss': 4.9358, 'learning_rate': 0.0002489655172413793, 'epoch': 0.02}
{'loss': 4.9352, 'learning_rate': 0.00024551724137931037, 'epoch': 0.02}
{'loss': 4.8925, 'learning_rate': 0.0002420689655172414, 'epoch': 0.02}
{'loss': 4.9003, 'learning_rate': 0.00023862068965517242, 'epoch': 0.02}
{'loss': 4.95, 'learning_rate': 0.00023517241379310345, 'epoch': 0.02}
{'loss': 5.0561, 'learning_rate': 0.0002317241379310345, 'epoch': 0.02}
{'loss': 4.9288, 'learning_rate': 0.00022827586206896553, 'epoch': 0.02}
{'loss': 4.6204, 'learning_rate': 0.00022482758620689656, 'epoch': 0.02}
{'loss': 4.7895, 'learning_rate': 0.0002213793103448276, 'epoch': 0.02}
{'loss': 4.6687, 'learning_rate': 0.00021793103448275865, 'epoch': 0.02}
{'loss': 4.8489, 'learning_rate': 0.00021448275862068967, 'epoch': 0.02}
{'loss': 5.0163, 'learning_rate': 0.0002110344827586207, 'epoch': 0.02}
{'loss': 4.7498, 'learning_rate': 0.00020758620689655173, 'epoch': 0.02}
{'loss': 5.071, 'learning_rate': 0.0002041379310344828, 'epoch': 0.02}
{'loss': 4.6028, 'learning_rate': 0.00020068965517241381, 'epoch': 0.02}
{'loss': 5.2132, 'learning_rate': 0.00019724137931034484, 'epoch': 0.02}
{'loss': 5.3498, 'learning_rate': 0.00019379310344827587, 'epoch': 0.02}
{'loss': 4.8354, 'learning_rate': 0.0001903448275862069, 'epoch': 0.02}
{'loss': 4.7165, 'learning_rate': 0.00018689655172413795, 'epoch': 0.02}
{'loss': 5.1768, 'learning_rate': 0.00018344827586206898, 'epoch': 0.02}
{'loss': 4.8243, 'learning_rate': 0.00017999999999999998, 'epoch': 0.02}
{'loss': 4.8655, 'learning_rate': 0.000176551724137931, 'epoch': 0.02}
{'loss': 4.9831, 'learning_rate': 0.00017310344827586207, 'epoch': 0.02}
83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 2500/3000 [2:23:04<14:50, 1.78s/it]***** Running Evaluation *****
Num examples = 1070
Batch size = 1
{'eval_loss': 4.84500789642334, 'eval_runtime': 987.628, 'eval_samples_per_second': 1.083, 'eval_steps_per_second': 1.083, 'epoch': 0.02}
83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 2500/3000 [2:39:31<14:50, 1.78s/itSaving model checkpoint to output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500
/usr/local/lib/python3.8/site-packages/peft/utils/save_and_load.py:148: UserWarning: Could not find a config file in checkpoint/miniCPM-bf16/ - will assume that the vocabulary was not modified.
warnings.warn(
tokenizer config file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/tokenizer_config.json
Special tokens file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/special_tokens_map.json
[2024-02-20 17:40:42,775] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2500 is about to be saved!
/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
warnings.warn(
[2024-02-20 17:40:42,818] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/global_step2500/zero_pp_rank_0_mp_rank_00_model_states.pt
[2024-02-20 17:40:42,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/global_step2500/zero_pp_rank_0_mp_rank_00_model_states.pt...
[2024-02-20 17:40:42,853] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/global_step2500/zero_pp_rank_0_mp_rank_00_model_states.pt.
[2024-02-20 17:40:42,864] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2024-02-20 17:40:42,981] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2024-02-20 17:40:42,994] [INFO] [engine.py:3417:_save_zero_checkpoint] zero checkpoint saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-2500/global_step2500/zero_pp_rank_0_mp_rank_00_optim_states.pt
[2024-02-20 17:40:43,006] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2500 is ready now!
{'loss': 5.2281, 'learning_rate': 0.0001696551724137931, 'epoch': 0.02}
{'loss': 4.6762, 'learning_rate': 0.00016620689655172412, 'epoch': 0.02}
{'loss': 5.1642, 'learning_rate': 0.00016275862068965515, 'epoch': 0.02}
{'loss': 4.7472, 'learning_rate': 0.0001593103448275862, 'epoch': 0.02}
{'loss': 5.0087, 'learning_rate': 0.00015586206896551724, 'epoch': 0.02}
{'loss': 4.9628, 'learning_rate': 0.00015241379310344826, 'epoch': 0.02}
{'loss': 5.0469, 'learning_rate': 0.0001489655172413793, 'epoch': 0.02}
{'loss': 4.8538, 'learning_rate': 0.00014551724137931035, 'epoch': 0.02}
{'loss': 5.242, 'learning_rate': 0.00014206896551724138, 'epoch': 0.02}
{'loss': 4.8959, 'learning_rate': 0.0001386206896551724, 'epoch': 0.02}
{'loss': 4.8742, 'learning_rate': 0.00013517241379310343, 'epoch': 0.02}
{'loss': 4.958, 'learning_rate': 0.0001317241379310345, 'epoch': 0.02}
{'loss': 4.9919, 'learning_rate': 0.00012827586206896552, 'epoch': 0.02}
{'loss': 5.1347, 'learning_rate': 0.00012482758620689654, 'epoch': 0.02}
{'loss': 4.6106, 'learning_rate': 0.00012137931034482759, 'epoch': 0.02}
{'loss': 4.7297, 'learning_rate': 0.00011793103448275861, 'epoch': 0.02}
{'loss': 4.8609, 'learning_rate': 0.00011448275862068966, 'epoch': 0.02}
{'loss': 4.9314, 'learning_rate': 0.00011103448275862068, 'epoch': 0.02}
{'loss': 4.845, 'learning_rate': 0.00010758620689655173, 'epoch': 0.02}
{'loss': 4.8214, 'learning_rate': 0.00010413793103448275, 'epoch': 0.02}
{'loss': 4.6403, 'learning_rate': 0.0001006896551724138, 'epoch': 0.02}
{'loss': 4.8334, 'learning_rate': 9.724137931034482e-05, 'epoch': 0.02}
{'loss': 5.0028, 'learning_rate': 9.379310344827587e-05, 'epoch': 0.02}
{'loss': 4.7361, 'learning_rate': 9.03448275862069e-05, 'epoch': 0.02}
{'loss': 4.8325, 'learning_rate': 8.689655172413794e-05, 'epoch': 0.02}
{'loss': 4.5913, 'learning_rate': 8.344827586206896e-05, 'epoch': 0.02}
{'loss': 4.8138, 'learning_rate': 8e-05, 'epoch': 0.02}
92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 2770/3000 [2:47:46<06:39, 1.74s/it][2024-02-20 17:48:55,031] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 131072, but hysteresis is 2. Reducing hysteresis to 1
{'loss': 4.5148, 'learning_rate': 7.689655172413794e-05, 'epoch': 0.02}
{'loss': 4.7834, 'learning_rate': 7.344827586206897e-05, 'epoch': 0.02}
{'loss': 5.0962, 'learning_rate': 7.000000000000001e-05, 'epoch': 0.02}
{'loss': 4.8841, 'learning_rate': 6.655172413793104e-05, 'epoch': 0.02}
{'loss': 5.1609, 'learning_rate': 6.310344827586208e-05, 'epoch': 0.02}
{'loss': 4.8587, 'learning_rate': 5.9655172413793106e-05, 'epoch': 0.02}
{'loss': 4.5196, 'learning_rate': 5.620689655172414e-05, 'epoch': 0.02}
{'loss': 4.9806, 'learning_rate': 5.2758620689655176e-05, 'epoch': 0.02}
{'loss': 4.8975, 'learning_rate': 4.931034482758621e-05, 'epoch': 0.02}
{'loss': 4.6707, 'learning_rate': 4.5862068965517246e-05, 'epoch': 0.03}
{'loss': 4.9858, 'learning_rate': 4.2413793103448274e-05, 'epoch': 0.03}
{'loss': 4.9286, 'learning_rate': 3.896551724137931e-05, 'epoch': 0.03}
{'loss': 4.6976, 'learning_rate': 3.5517241379310344e-05, 'epoch': 0.03}
{'loss': 4.7789, 'learning_rate': 3.206896551724138e-05, 'epoch': 0.03}
{'loss': 5.1196, 'learning_rate': 2.8620689655172414e-05, 'epoch': 0.03}
{'loss': 4.8572, 'learning_rate': 2.517241379310345e-05, 'epoch': 0.03}
{'loss': 5.0144, 'learning_rate': 2.1724137931034484e-05, 'epoch': 0.03}
{'loss': 4.9704, 'learning_rate': 1.827586206896552e-05, 'epoch': 0.03}
{'loss': 4.8064, 'learning_rate': 1.4827586206896552e-05, 'epoch': 0.03}
{'loss': 4.851, 'learning_rate': 1.1379310344827587e-05, 'epoch': 0.03}
{'loss': 4.8067, 'learning_rate': 7.93103448275862e-06, 'epoch': 0.03}
{'loss': 4.562, 'learning_rate': 4.482758620689655e-06, 'epoch': 0.03}
{'loss': 4.8734, 'learning_rate': 1.0344827586206898e-06, 'epoch': 0.03}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [2:54:35<00:00, 1.77s/it]***** Running Evaluation *****
Num examples = 1070
Batch size = 1
{'eval_loss': 4.809491157531738, 'eval_runtime': 978.543, 'eval_samples_per_second': 1.093, 'eval_steps_per_second': 1.093, 'epoch': 0.03}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [3:10:54<00:00, 1.77s/itSaving model checkpoint to output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000
/usr/local/lib/python3.8/site-packages/peft/utils/save_and_load.py:148: UserWarning: Could not find a config file in checkpoint/miniCPM-bf16/ - will assume that the vocabulary was not modified.
warnings.warn(
tokenizer config file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/tokenizer_config.json
Special tokens file saved in output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/special_tokens_map.json
[2024-02-20 18:12:04,484] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is about to be saved!
/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
warnings.warn(
[2024-02-20 18:12:04,524] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_model_states.pt
[2024-02-20 18:12:04,524] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_model_states.pt...
[2024-02-20 18:12:04,556] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_model_states.pt.
[2024-02-20 18:12:04,557] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2024-02-20 18:12:04,654] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2024-02-20 18:12:04,664] [INFO] [engine.py:3417:_save_zero_checkpoint] zero checkpoint saved output/AdvertiseGenLoRA/20240220145905/tmp-checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt
[2024-02-20 18:12:04,675] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step3000 is ready now!
Training completed. Do not forget to share your model on huggingface.co/models =)
{'train_runtime': 11457.8744, 'train_samples_per_second': 0.262, 'train_steps_per_second': 0.262, 'train_loss': 5.106624983469645, 'epoch': 0.03}
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [3:10:57<00:00, 3.82s/it]
[2024-02-20 18:12:13,846] [INFO] [launch.py:347:main] Process 198 exits successfully.
No preview for this file type
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment