Initial commit

53b3977b · dongchy920 · 53b3977b · 53b3977b · 53b3977b · 53b3977b
Commit 53b3977b authored Jul 11, 2025 by dongchy920
20 changed files
--- a/LLaMA-Factory/data/mllm_demo_data/2.avi
+++ b/LLaMA-Factory/data/mllm_demo_data/2.avi
--- a/LLaMA-Factory/data/mllm_demo_data/2.jpg
+++ b/LLaMA-Factory/data/mllm_demo_data/2.jpg
--- a/LLaMA-Factory/data/mllm_demo_data/3.jpg
+++ b/LLaMA-Factory/data/mllm_demo_data/3.jpg
--- a/LLaMA-Factory/data/mllm_demo_data/3.mp4
+++ b/LLaMA-Factory/data/mllm_demo_data/3.mp4
--- a/LLaMA-Factory/data/mllm_video_demo.json
+++ b/LLaMA-Factory/data/mllm_video_demo.json
+[
+  {
+    "messages": [
+      {
+        "content": "<video>Why is this video funny?",
+        "role": "user"
+      },
+      {
+        "content": "Because a baby is reading, and he is so cute!",
+        "role": "assistant"
+      }
+    ],
+    "videos": [
+      "mllm_demo_data/1.mp4"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<video>What is she doing?",
+        "role": "user"
+      },
+      {
+        "content": "She is cooking.",
+        "role": "assistant"
+      }
+    ],
+    "videos": [
+      "mllm_demo_data/2.avi"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<video>What's in the video?",
+        "role": "user"
+      },
+      {
+        "content": "A baby is playing in the living room.",
+        "role": "assistant"
+      }
+    ],
+    "videos": [
+      "mllm_demo_data/3.mp4"
+    ]
+  }
+]
--- a/LLaMA-Factory/data/ultra_chat/ultra_chat.py
+++ b/LLaMA-Factory/data/ultra_chat/ultra_chat.py
+import json
+import os
+from typing import List
+
+import datasets
+
+
+# Base URL used to build every link below; the HF_ENDPOINT environment variable
+# overrides the default "https://huggingface.co".
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
+
+# Dataset metadata passed to datasets.DatasetInfo in UltraChat._info().
+_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
+
+_CITATION = """\
+@misc{UltraChat,
+  author = {Ding, Ning and Chen, Yulin and Xu, Bokai and Hu, Shengding and Qin, Yujia and Liu, Zhiyuan and Sun, Maosong and Zhou, Bowen},
+  title = {UltraChat: A Large-scale Auto-generated Multi-round Dialogue Data},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\\url{https://github.com/thunlp/ultrachat}},
+}
+"""
+
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
+_LICENSE = "cc-by-nc-4.0"
+# Per-shard URL template. The doubled braces survive the f-string as a literal
+# "{idx}" placeholder, which is filled in later via str.format(idx=...).
+_BASE_DATA_URL = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl"
+
+
+class UltraChat(datasets.GeneratorBasedBuilder):
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self):
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager):
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
+
+    def _generate_examples(self, filepaths: List[str]):
+        for filepath in filepaths:
+            with open(filepath, encoding="utf-8") as f:
+                for row in f:
+                    try:
+                        data = json.loads(row)
+                    except Exception:
+                        continue
+                    key: int = data["id"]
+                    content: List[str] = data["data"]
+                    if len(content) % 2 == 1:
+                        content.pop(-1)
+                    if len(content) < 2:
+                        continue
+                    conversations = [
+                        {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
+                    ]
+                    yield key, {"conversations": conversations}
--- a/LLaMA-Factory/data/wiki_demo.txt
+++ b/LLaMA-Factory/data/wiki_demo.txt
--- a/LLaMA-Factory/deepspeed-0.14.2+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
+++ b/LLaMA-Factory/deepspeed-0.14.2+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
--- a/LLaMA-Factory/docker/docker-cuda/Dockerfile
+++ b/LLaMA-Factory/docker/docker-cuda/Dockerfile
--- a/LLaMA-Factory/docker/docker-cuda/docker-compose.yml
+++ b/LLaMA-Factory/docker/docker-cuda/docker-compose.yml
--- a/LLaMA-Factory/docker/docker-npu/Dockerfile
+++ b/LLaMA-Factory/docker/docker-npu/Dockerfile
--- a/LLaMA-Factory/docker/docker-npu/docker-compose.yml
+++ b/LLaMA-Factory/docker/docker-npu/docker-compose.yml
--- a/LLaMA-Factory/docker/docker-rocm/Dockerfile
+++ b/LLaMA-Factory/docker/docker-rocm/Dockerfile
--- a/LLaMA-Factory/docker/docker-rocm/docker-compose.yml
+++ b/LLaMA-Factory/docker/docker-rocm/docker-compose.yml
--- a/LLaMA-Factory/evaluation/ceval/ceval.py
+++ b/LLaMA-Factory/evaluation/ceval/ceval.py
--- a/LLaMA-Factory/evaluation/ceval/ceval.zip
+++ b/LLaMA-Factory/evaluation/ceval/ceval.zip
--- a/LLaMA-Factory/evaluation/ceval/mapping.json
+++ b/LLaMA-Factory/evaluation/ceval/mapping.json
--- a/LLaMA-Factory/evaluation/cmmlu/cmmlu.py
+++ b/LLaMA-Factory/evaluation/cmmlu/cmmlu.py
--- a/LLaMA-Factory/evaluation/cmmlu/cmmlu.zip
+++ b/LLaMA-Factory/evaluation/cmmlu/cmmlu.zip
--- a/LLaMA-Factory/evaluation/cmmlu/mapping.json
+++ b/LLaMA-Factory/evaluation/cmmlu/mapping.json