Commit 4a40151b authored by chenych

Update v0.8.3

parent 731cf9b8
@@ -160,8 +160,6 @@ cython_debug/
 .idea/

 # custom .gitignore
-ms_cache/
-hf_cache/
 cache/
 config/
 saves/
......
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

version: 2

build:
  os: ubuntu-22.04
  tools:
    python: "3.8"

sphinx:
  configuration: docs/source/conf.py

formats:
  - pdf

python:
  install:
    - requirements: docs/requirements-docs.txt
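For reference, a rough local equivalent of the Read the Docs build configured above (the output directory is arbitrary, and Sphinx may already be pinned in the requirements file):

```bash
# Install the documentation requirements, then build the Sphinx site whose conf.py sits in docs/source.
pip install sphinx -r docs/requirements-docs.txt
sphinx-build -b html docs/source docs/_build/html
```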
assets/wechat.jpg: image replaced (122 KB → 142 KB)
assets/wechat_npu.jpg: image replaced (193 KB → 147 KB)
@@ -266,13 +266,6 @@
     "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered",
     "formatting": "sharegpt"
   },
-  "magpie_ultra": {
-    "hf_hub_url": "argilla/magpie-ultra-v0.1",
-    "columns": {
-      "prompt": "instruction",
-      "response": "response"
-    }
-  },
   "web_instruct": {
     "hf_hub_url": "TIGER-Lab/WebInstructSub",
     "columns": {
......
@@ -189,12 +189,6 @@ llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
 ```
-#### Full-Parameter Fine-Tuning using Adam-mini
-```bash
-llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
-```
 #### LoRA+ Fine-Tuning
 ```bash
......
 We provide diverse examples of fine-tuning LLMs.
-Make sure to execute these commands in the `llama_factory` directory.
+Make sure to execute these commands in the `LLaMA-Factory` directory.

 ## Table of Contents
@@ -11,7 +11,7 @@
 - [Inferring LoRA Fine-Tuned Models](#推理-lora-模型)
 - [Extras](#杂项)

-Use `HIP_VISIBLE_DEVICES` to choose computing devices.
+Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.

 ## Examples
@@ -189,12 +189,6 @@ llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
 ```
-#### Full-Parameter Fine-Tuning using Adam-mini
-```bash
-llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
-```
 #### LoRA+ Fine-Tuning
 ```bash
......
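As a usage sketch of the device-selection note above (the config path is one of the example commands already shown; substitute your own):

```bash
# Run training on GPU 0 only; on Ascend NPUs use ASCEND_RT_VISIBLE_DEVICES instead.
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
```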
@@ -10,7 +10,6 @@ badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
-# deepspeed: examples/deepspeed/ds_z3_config.json

 ### dataset
 dataset: identity,alpaca_en_demo
@@ -30,7 +29,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
......
@@ -29,12 +29,11 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 pure_bf16: true
-ddp_timeout: 180000000

 ### eval
 val_size: 0.1
......
@@ -2,5 +2,5 @@
 python scripts/llama_pro.py \
     --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --output_dir models/llama3-8b-pro \
+    --output_dir models/llama3-8b-instruct-pro \
     --num_expand 8

 ### model
-model_name_or_path: models/llama3-8b-pro
+model_name_or_path: models/llama3-8b-instruct-pro

 ### method
 stage: sft
@@ -18,7 +18,7 @@ overwrite_cache: true
 preprocessing_num_workers: 16

 ### output
-output_dir: saves/llama3-8b-pro/freeze/sft
+output_dir: saves/llama3-8b-instruct-pro/freeze/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
......
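The script and config above fit together as an expand-then-train flow; a sketch, assuming the freeze SFT config shown above lives at examples/extras/llama_pro/llama3_freeze_sft.yaml:

```bash
# 1. Expand Meta-Llama-3-8B-Instruct by 8 blocks and write the result to models/llama3-8b-instruct-pro.
python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-instruct-pro \
    --num_expand 8

# 2. Fine-tune the expanded model with the freeze SFT config shown above (path assumed).
llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
```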
@@ -26,7 +26,7 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
......
@@ -7,7 +7,7 @@ do_predict: true
 finetuning_type: full

 ### dataset
-eval_dataset: identity,alpaca_en_demo
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 50
......
@@ -25,7 +25,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 1.0e-5
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
......
-transformers>=4.41.2,<=4.43.4
-datasets>=2.16.0,<=2.20.0
-accelerate>=0.30.1,<=0.32.0
-peft>=0.11.1,<=0.12.0
-trl>=0.8.6,<=0.9.6
+transformers>=4.41.2
+datasets>=2.16.0
+accelerate>=0.30.1
+peft>=0.11.1
+trl>=0.8.6
 gradio>=4.0.0
 pandas>=2.0.0
 scipy
......
@@ -36,11 +36,9 @@ def calculate_flops(
     """
     with get_accelerator().device(0):
         chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn))
-        fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.engine.model.device)
+        fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device)
         input_dict = {"input_ids": fake_input, "labels": fake_input.clone()}
-        flops, macs, params = get_model_profile(
-            chat_model.engine.model, kwargs=input_dict, print_profile=True, detailed=True
-        )
+        flops, macs, params = get_model_profile(chat_model.model, kwargs=input_dict, print_profile=True, detailed=True)
         print("FLOPs:", flops)
         print("MACs:", macs)
         print("Params:", params)
......
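For context, the profiling snippet above is typically driven from the command line; a sketch, assuming the script lives at scripts/cal_flops.py and exposes the arguments visible in its signature:

```bash
# Profile FLOPs, MACs and parameter count on a dummy batch of shape (batch_size, seq_length).
python scripts/cal_flops.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --batch_size 1 \
    --seq_length 512
```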
@@ -43,7 +43,7 @@ def calculate_lr(
     dataset_dir: str = "data",
     template: str = "default",
     cutoff_len: int = 1024,  # i.e. maximum input length during training
-    is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate,
+    is_mistral: bool = False,  # mistral model uses a smaller learning rate,
     packing: bool = False,
 ):
     r"""
@@ -84,7 +84,7 @@ def calculate_lr(
     valid_ratio = valid_tokens / total_tokens
     batch_valid_len = batch_max_len * valid_ratio
     lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)  # lr ~ sqrt(batch_size)
-    lr = lr / 6.0 if is_mistral_or_gemma else lr
+    lr = lr / 6.0 if is_mistral else lr
     print(
         "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
             lr, valid_ratio * 100, batch_valid_len
......
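The rule above is simply lr ∝ sqrt(effective batch size in valid tokens), with an extra 1/6 factor for Mistral-style models; an illustrative calculation (BASE_LR = 3e-4 and BASE_BS = 4,000,000 are assumed values, not shown in this diff):

```bash
# 1,000,000 valid tokens at an assumed BASE_BS of 4,000,000 gives 3e-4 * sqrt(1/4) = 1.5e-4.
python -c "import math; print(3e-4 * math.sqrt(1_000_000 / 4_000_000))"
```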
@@ -19,7 +19,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional

 import fire
 import torch
@@ -47,8 +47,8 @@ def block_expansion(
     model_name_or_path: str,
     output_dir: str,
     num_expand: int,
-    shard_size: str = "2GB",
-    save_safetensors: bool = True,
+    shard_size: Optional[str] = "2GB",
+    save_safetensors: Optional[bool] = False,
 ):
     r"""
     Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
......
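A sketch of overriding the defaults shown in the signature above (shard_size="2GB", save_safetensors=False) when expanding a model; the flag names are taken from that signature:

```bash
python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-instruct-pro \
    --num_expand 8 \
    --shard_size 5GB \
    --save_safetensors True
```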