Commit 4a40151b authored by chenych

Update v0.8.3

parent 731cf9b8
@@ -160,8 +160,6 @@ cython_debug/
 .idea/

 # custom .gitignore
-ms_cache/
-hf_cache/
 cache/
 config/
 saves/
......
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.8"

sphinx:
  configuration: docs/source/conf.py

formats:
  - pdf

python:
  install:
    - requirements: docs/requirements-docs.txt
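For reference, a rough local equivalent of the Read the Docs build configured above (the output directory is arbitrary, and Sphinx may already be pinned in the requirements file):

```bash
# Install the documentation requirements, then build the Sphinx site whose conf.py sits in docs/source.
pip install sphinx -r docs/requirements-docs.txt
sphinx-build -b html docs/source docs/_build/html
```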
assets/wechat.jpg: image replaced (122 KB → 142 KB)
assets/wechat_npu.jpg: image replaced (193 KB → 147 KB)
@@ -266,13 +266,6 @@
     "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered",
     "formatting": "sharegpt"
   },
-  "magpie_ultra": {
-    "hf_hub_url": "argilla/magpie-ultra-v0.1",
-    "columns": {
-      "prompt": "instruction",
-      "response": "response"
-    }
-  },
   "web_instruct": {
     "hf_hub_url": "TIGER-Lab/WebInstructSub",
     "columns": {
......
@@ -189,12 +189,6 @@ llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
 ```
-#### Full-Parameter Fine-Tuning using Adam-mini
-```bash
-llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
-```
 #### LoRA+ Fine-Tuning
 ```bash
......
 We provide diverse examples of fine-tuning LLMs.
-Make sure to execute these commands in the `llama_factory` directory.
+Make sure to execute these commands in the `LLaMA-Factory` directory.

 ## Table of Contents
@@ -11,7 +11,7 @@
 - [Inferring LoRA Fine-Tuned Models](#推理-lora-模型)
 - [Extras](#杂项)

-Use `HIP_VISIBLE_DEVICES` to choose computing devices.
+Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.

 ## Examples
@@ -189,12 +189,6 @@ llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
 ```
-#### Full-Parameter Fine-Tuning using Adam-mini
-```bash
-llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
-```
 #### LoRA+ Fine-Tuning
 ```bash
......
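As a usage sketch of the device-selection note above (the config path is one of the example commands already shown; substitute your own):

```bash
# Run training on GPU 0 only; on Ascend NPUs use ASCEND_RT_VISIBLE_DEVICES instead.
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
```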
@@ -10,7 +10,6 @@ badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
-# deepspeed: examples/deepspeed/ds_z3_config.json

 ### dataset
 dataset: identity,alpaca_en_demo
@@ -30,7 +29,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
......
@@ -29,12 +29,11 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 pure_bf16: true
-ddp_timeout: 180000000

 ### eval
 val_size: 0.1
......
@@ -2,5 +2,5 @@
 python scripts/llama_pro.py \
     --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --output_dir models/llama3-8b-pro \
+    --output_dir models/llama3-8b-instruct-pro \
     --num_expand 8

 ### model
-model_name_or_path: models/llama3-8b-pro
+model_name_or_path: models/llama3-8b-instruct-pro

 ### method
 stage: sft
@@ -18,7 +18,7 @@ overwrite_cache: true
 preprocessing_num_workers: 16

 ### output
-output_dir: saves/llama3-8b-pro/freeze/sft
+output_dir: saves/llama3-8b-instruct-pro/freeze/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
......
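The script and config above fit together as an expand-then-train flow; a sketch, assuming the freeze SFT config shown above lives at examples/extras/llama_pro/llama3_freeze_sft.yaml:

```bash
# 1. Expand Meta-Llama-3-8B-Instruct by 8 blocks and write the result to models/llama3-8b-instruct-pro.
python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-instruct-pro \
    --num_expand 8

# 2. Fine-tune the expanded model with the freeze SFT config shown above (path assumed).
llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
```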
@@ -26,7 +26,7 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
......
@@ -7,7 +7,7 @@ do_predict: true
 finetuning_type: full

 ### dataset
-eval_dataset: identity,alpaca_en_demo
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 50
......
@@ -25,7 +25,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
......
-transformers>=4.41.2,<=4.43.4
-datasets>=2.16.0,<=2.20.0
-accelerate>=0.30.1,<=0.32.0
-peft>=0.11.1,<=0.12.0
-trl>=0.8.6,<=0.9.6
+transformers>=4.41.2
+datasets>=2.16.0
+accelerate>=0.30.1
+peft>=0.11.1
+trl>=0.8.6
 gradio>=4.0.0
 pandas>=2.0.0
 scipy
......
@@ -36,11 +36,9 @@ def calculate_flops(
     """
     with get_accelerator().device(0):
         chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn))
-        fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.engine.model.device)
+        fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device)
         input_dict = {"input_ids": fake_input, "labels": fake_input.clone()}
-        flops, macs, params = get_model_profile(
-            chat_model.engine.model, kwargs=input_dict, print_profile=True, detailed=True
-        )
+        flops, macs, params = get_model_profile(chat_model.model, kwargs=input_dict, print_profile=True, detailed=True)
         print("FLOPs:", flops)
         print("MACs:", macs)
         print("Params:", params)
......
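For context, the profiling snippet above is typically driven from the command line; a sketch, assuming the script lives at scripts/cal_flops.py and exposes the arguments visible in its signature:

```bash
# Profile FLOPs, MACs and parameter count on a dummy batch of shape (batch_size, seq_length).
python scripts/cal_flops.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --batch_size 1 \
    --seq_length 512
```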
@@ -43,7 +43,7 @@ def calculate_lr(
     dataset_dir: str = "data",
     template: str = "default",
     cutoff_len: int = 1024,  # i.e. maximum input length during training
-    is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate,
+    is_mistral: bool = False,  # mistral model uses a smaller learning rate,
     packing: bool = False,
 ):
     r"""
@@ -84,7 +84,7 @@ def calculate_lr(
     valid_ratio = valid_tokens / total_tokens
     batch_valid_len = batch_max_len * valid_ratio
     lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)  # lr ~ sqrt(batch_size)
-    lr = lr / 6.0 if is_mistral_or_gemma else lr
+    lr = lr / 6.0 if is_mistral else lr
     print(
         "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
             lr, valid_ratio * 100, batch_valid_len
......
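The rule above is simply lr ∝ sqrt(effective batch size in valid tokens), with an extra 1/6 factor for Mistral-style models; an illustrative calculation (BASE_LR = 3e-4 and BASE_BS = 4,000,000 are assumed values, not shown in this diff):

```bash
# 1,000,000 valid tokens at an assumed BASE_BS of 4,000,000 gives 3e-4 * sqrt(1/4) = 1.5e-4.
python -c "import math; print(3e-4 * math.sqrt(1_000_000 / 4_000_000))"
```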
@@ -19,7 +19,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import fire
 import torch
@@ -47,8 +47,8 @@ def block_expansion(
     model_name_or_path: str,
     output_dir: str,
     num_expand: int,
-    shard_size: str = "2GB",
-    save_safetensors: bool = True,
+    shard_size: Optional[str] = "2GB",
+    save_safetensors: Optional[bool] = False,
 ):
     r"""
     Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
......
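A sketch of overriding the defaults shown in the signature above (shard_size="2GB", save_safetensors=False) when expanding a model; the flag names are taken from that signature:

```bash
python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-instruct-pro \
    --num_expand 8 \
    --shard_size 5GB \
    --save_safetensors True
```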