update to v0.9.1_stable

2778a3d0 · luopl · e92143e3 · 2778a3d0 · 2778a3d0 · 2778a3d0
Commit 2778a3d0 authored Jan 16, 2025 by luopl
20 changed files
--- a/.dockerignore
+++ b/.dockerignore
@@ -7,6 +7,8 @@ data
 docker
 saves
 hf_cache
+ms_cache
+om_cache
 output
 .dockerignore
 .gitattributes

--- a/.env.local
+++ b/.env.local
 # Note: actually we do not support .env, just for reference
 # api
-API_HOST=0.0.0.0
+API_HOST=
-API_PORT=8000
+API_PORT=
 API_KEY=
-API_MODEL_NAME=gpt-3.5-turbo
+API_MODEL_NAME=
 FASTAPI_ROOT_PATH=
+MAX_CONCURRENT=
 # general
 DISABLE_VERSION_CHECK=
 FORCE_CHECK_IMPORTS=
 LLAMAFACTORY_VERBOSITY=
 USE_MODELSCOPE_HUB=
+USE_OPENMIND_HUB=
 RECORD_VRAM=
 # torchrun
 FORCE_TORCHRUN=
 MASTER_ADDR=
 MASTER_PORT=
 NNODES=
-RANK=
+NODE_RANK=
 NPROC_PER_NODE=
 # wandb
 WANDB_DISABLED=
-WANDB_PROJECT=huggingface
+WANDB_PROJECT=
 WANDB_API_KEY=
 # gradio ui
-GRADIO_SHARE=False
+GRADIO_SHARE=
-GRADIO_SERVER_NAME=0.0.0.0
+GRADIO_SERVER_NAME=
 GRADIO_SERVER_PORT=
 GRADIO_ROOT_PATH=
+GRADIO_IPV6=
 # setup
 ENABLE_SHORT_CONSOLE=1
 # reserved (do not use)

--- a/.gitignore
+++ b/.gitignore
@@ -159,9 +159,13 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+# vscode
+.vscode/
 # custom .gitignore
 ms_cache/
 hf_cache/
+om_cache/
 cache/
 config/
 saves/

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+    -   id: check-ast
+    -   id: check-added-large-files
+        args: ['--maxkb=25000']
+    -   id: check-merge-conflict
+    -   id: check-yaml
+    -   id: debug-statements
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
+    -   id: no-commit-to-branch
+        args: ['--branch', 'main']
+-   repo: https://github.com/asottile/pyupgrade
+    rev: v3.17.0
+    hooks:
+    -   id: pyupgrade
+        args: [--py38-plus]
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
+    hooks:
+    -   id: ruff
+        args: [--fix]
+    -   id: ruff-format
--- a/Makefile
+++ b/Makefile
-.PHONY: quality style test
+.PHONY: build commit quality style test
 check_dirs := scripts src tests setup.py
+build:
+	pip install build && python -m build
+commit:
+	pre-commit install
+	pre-commit run --all-files
 quality:
 	ruff check $(check_dirs)
 	ruff format --check $(check_dirs)
@@ -11,4 +18,4 @@ style:
 	ruff format $(check_dirs)
 test:
-	CUDA_VISIBLE_DEVICES= pytest tests/
+	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/
--- a/assets/benchmark.svg
+++ b/assets/benchmark.svg
--- a/assets/wechat.jpg
+++ b/assets/wechat.jpg
--- a/assets/wechat_npu.jpg
+++ b/assets/wechat_npu.jpg
--- a/data/alpaca_en_demo.json
+++ b/data/alpaca_en_demo.json
--- a/data/alpaca_zh_demo.json
+++ b/data/alpaca_zh_demo.json
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@@ -17,9 +17,9 @@ _CITATION = """\
 }
 """
-_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT)
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
 _LICENSE = "gpl-3.0"
-_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)
+_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
@@ -38,7 +38,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
    def _generate_examples(self, filepath: str):
-        with open(filepath, "r", encoding="utf-8") as f:
+        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
                conversations = []

--- a/data/c4_demo.json
+++ b/data/c4_demo.json
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -54,7 +54,8 @@
  },
  "alpaca_en": {
    "hf_hub_url": "llamafactory/alpaca_en",
-    "ms_hub_url": "llamafactory/alpaca_en"
+    "ms_hub_url": "llamafactory/alpaca_en",
+    "om_hub_url": "HaM/alpaca_en"
  },
  "alpaca_zh": {
    "hf_hub_url": "llamafactory/alpaca_zh",
@@ -66,7 +67,8 @@
  },
  "alpaca_gpt4_zh": {
    "hf_hub_url": "llamafactory/alpaca_gpt4_zh",
-    "ms_hub_url": "llamafactory/alpaca_gpt4_zh"
+    "ms_hub_url": "llamafactory/alpaca_gpt4_zh",
+    "om_hub_url": "State_Cloud/alpaca-gpt4-data-zh"
  },
  "glaive_toolcall_en": {
    "hf_hub_url": "llamafactory/glaive_toolcall_en",

--- a/data/dpo_en_demo.json
+++ b/data/dpo_en_demo.json
--- a/data/dpo_zh_demo.json
+++ b/data/dpo_zh_demo.json
--- a/data/glaive_toolcall_en_demo.json
+++ b/data/glaive_toolcall_en_demo.json
--- a/data/glaive_toolcall_zh_demo.json
+++ b/data/glaive_toolcall_zh_demo.json
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
@@ -8,9 +8,9 @@ import datasets
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 _CITATION = ""
-_HOMEPAGE = "{}/datasets/Anthropic/hh-rlhf".format(_HF_ENDPOINT)
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
 _LICENSE = "mit"
-_URL = "{}/datasets/Anthropic/hh-rlhf/resolve/main/".format(_HF_ENDPOINT)
+_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
 _URLS = {
    "train": [
        _URL + "harmless-base/train.jsonl.gz",
@@ -53,7 +53,7 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, filepaths: List[str]):
        key = 0
        for filepath in filepaths:
-            with open(filepath, "r", encoding="utf-8") as f:
+            with open(filepath, encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    chosen = data["chosen"]

--- a/data/identity.json
+++ b/data/identity.json
--- a/data/kto_en_demo.json
+++ b/data/kto_en_demo.json