v1.0

4d4d8f59 · chenzk · 4d4d8f59 · 4d4d8f59 · 4d4d8f59 · 4d4d8f59
Commit 4d4d8f59 authored Jun 04, 2025 by chenzk
20 changed files
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
+{
+  "tests": [
+    {
+      "name": "quick_smoke_test_sft",
+      "input": {
+        "user_id": "user",
+        "model_id": "llama-test",
+        "run_id": "llama-test",
+        "credentials": {
+          "wandb_api_key": "",
+          "hf_token": ""
+        },
+        "args": {
+          "base_model": "HuggingFaceTB/SmolLM2-135M",
+          "model_type": "AutoModelForCausalLM",
+          "tokenizer_type": "AutoTokenizer",
+          "load_in_4bit": true,
+          "strict": false,
+          "datasets": [
+            {
+              "path": "mhenrichsen/alpaca_2k_test",
+              "type": "alpaca",
+              "split": "train[:10%]"
+            }
+          ],
+          "val_set_size": 0.02,
+          "output_dir": "./outputs/lora-out",
+          "sequence_len": 4096,
+          "sample_packing": true,
+          "eval_sample_packing": false,
+          "pad_to_sequence_len": true,
+          "adapter": "qlora",
+          "lora_r": 32,
+          "lora_alpha": 64,
+          "lora_dropout": 0.05,
+          "lora_target_linear": true,
+          "lora_modules_to_save": [
+            "embed_tokens",
+            "lm_head"
+          ],
+          "gradient_accumulation_steps": 2,
+          "micro_batch_size": 1,
+          "num_epochs": 1,
+          "optimizer": "adamw_torch_fused",
+          "lr_scheduler": "cosine",
+          "learning_rate": 0.0002,
+          "train_on_inputs": false,
+          "group_by_length": false,
+          "bf16": "auto",
+          "tf32": true,
+          "gradient_checkpointing": true,
+          "logging_steps": 1,
+          "flash_attention": true,
+          "warmup_steps": 1,
+          "evals_per_epoch": 1,
+          "eval_max_new_tokens": 128,
+          "saves_per_epoch": 1,
+          "weight_decay": 0.0,
+          "special_tokens": {
+            "pad_token": "<|endoftext|>"
+          },
+          "max_steps": 20
+        }
+      },
+      "timeout": 100000
+    }
+  ],
+  "config": {
+    "gpuTypeId": "NVIDIA GeForce RTX 4090",
+    "gpuCount": 1,
+    "containerDiskInGb": 200,
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "value": ""
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "value": "true"
+      }
+    ],
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ]
+  }
+}
--- a/.vscode/README.md
+++ b/.vscode/README.md
+See [docs/debugging.md](../docs/debugging.md) for guidance on how to modify these files to debug axolotl with VSCode.
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
+{
+    // VS Code debug configuration for stepping through an axolotl training run.
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debug axolotl prompt - sharegpt",
+            "type": "python",
+            "module": "accelerate.commands.launch",
+            "request": "launch",
+            "args": [
+                "-m", "axolotl.cli.train", "dev_sharegpt.yml",
+                // The flags below simplify debugging by overriding values from the axolotl config
+                // (see docs/debugging.md for the accompanying guidance).  Modify as needed.
+                "--dataset_processes=1",      // limits data preprocessing to one process
+                "--max_steps=1",              // limits training to just one step
+                "--batch_size=1",             // minimizes batch size
+                "--micro_batch_size=1",       // minimizes batch size
+                "--val_set_size=0",           // disables validation
+                "--sample_packing=False",     // disables sample packing which is necessary for small datasets
+                "--eval_sample_packing=False",// disables sample packing on eval set
+                "--dataset_prepared_path=temp_debug/axolotl_outputs/data", // send data outputs to a temp folder
+                "--output_dir=temp_debug/axolotl_outputs/model" // send model outputs to a temp folder
+                ],
+            "console": "integratedTerminal",      // show output in the integrated terminal
+            "cwd": "${workspaceFolder}/devtools", // set working directory to devtools from the root of the project
+            "justMyCode": true,                   // step through only axolotl code
+            "env": {"CUDA_VISIBLE_DEVICES": "0",  // Since we aren't doing distributed training, we need to limit to one GPU
+                    "HF_HOME": "${workspaceFolder}/devtools/temp_debug/.hf-cache"}, // send HF cache to a temp folder
+            "preLaunchTask": "cleanup-for-dataprep" // delete temp folders before launching (task is defined in .vscode/tasks.json)
+        }
+    ]
+}
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
+// This file is used by launch.json, whose "preLaunchTask" runs the "cleanup-for-dataprep" task defined here.
+{
+  "version": "2.0.0",
+  "tasks": [
+    // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder
+    {
+      "label": "delete-outputs",
+      "type": "shell",
+      "command": "rm -rf temp_debug/axolotl_outputs",
+      "options": { "cwd": "${workspaceFolder}/devtools" },
+      "problemMatcher": []
+    },
+    // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder
+    {
+      "label": "delete-temp-hf-dataset-cache",
+      "type": "shell",
+      "command": "rm -rf temp_debug/.hf-cache/datasets",
+      "options": { "cwd": "${workspaceFolder}/devtools" },
+      "problemMatcher": []
+    },
+    // this task combines the two tasks above
+    {
+      "label": "cleanup-for-dataprep",
+      "dependsOn": ["delete-outputs", "delete-temp-hf-dataset-cache"]
+    }
+  ]
+}
--- a/CNAME
+++ b/CNAME
+docs.axolotl.ai
--- a/FAQS.md
+++ b/FAQS.md
+# FAQs
+- Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
+- Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
+- `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c`
+`/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
+This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/MANIFEST.in
+++ b/MANIFEST.in
+include requirements.txt
+include README.md
+include LICENSE
+include src/setuptools_axolotl_dynamic_dependencies.py
+recursive-include axolotl *.py
--- a/Qwen/Qwen3-4B/README.md
+++ b/Qwen/Qwen3-4B/README.md
+---
+library_name: transformers
+license: apache-2.0
+license_link: https://huggingface.co/Qwen/Qwen3-4B/blob/main/LICENSE
+pipeline_tag: text-generation
+base_model:
+- Qwen/Qwen3-4B-Base
+---
+# Qwen3-4B
+<a href="https://chat.qwen.ai/" target="_blank" style="margin: 2px;">
+    <img alt="Chat" src="https://img.shields.io/badge/%F0%9F%92%9C%EF%B8%8F%20Qwen%20Chat%20-536af5" style="display: inline-block; vertical-align: middle;"/>
+</a>
+## Qwen3 Highlights
+Qwen3 is the latest generation of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support, with the following key features:
+- **Unique support for seamless switching between thinking mode** (for complex logical reasoning, math, and coding) and **non-thinking mode** (for efficient, general-purpose dialogue) **within a single model**, ensuring optimal performance across various scenarios.
+- **Significant enhancement of its reasoning capabilities**, surpassing previous QwQ (in thinking mode) and Qwen2.5 instruct models (in non-thinking mode) on mathematics, code generation, and commonsense logical reasoning.
+- **Superior human preference alignment**, excelling in creative writing, role-playing, multi-turn dialogues, and instruction following, to deliver a more natural, engaging, and immersive conversational experience.
+- **Expertise in agent capabilities**, enabling precise integration with external tools in both thinking and non-thinking modes and achieving leading performance among open-source models in complex agent-based tasks.
+- **Support of 100+ languages and dialects** with strong capabilities for **multilingual instruction following** and **translation**.
+## Model Overview
+**Qwen3-4B** has the following features:
+- Type: Causal Language Models
+- Training Stage: Pretraining & Post-training
+- Number of Parameters: 4.0B
+- Number of Parameters (Non-Embedding): 3.6B
+- Number of Layers: 36
+- Number of Attention Heads (GQA): 32 for Q and 8 for KV
+- Context Length: 32,768 natively and [131,072 tokens with YaRN](#processing-long-texts). 
+For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our [blog](https://qwenlm.github.io/blog/qwen3/), [GitHub](https://github.com/QwenLM/Qwen3), and [Documentation](https://qwen.readthedocs.io/en/latest/).
+> [!TIP]
+> If you encounter significant endless repetitions, please refer to the [Best Practices](#best-practices) section for optimal sampling parameters, and set the ``presence_penalty`` to 1.5.
+## Quickstart
+The code for Qwen3 is included in the latest Hugging Face `transformers`, and we advise you to use the latest version of `transformers`.
+With `transformers<4.51.0`, you will encounter the following error:
+```
+KeyError: 'qwen3'
+```
+The following contains a code snippet illustrating how to use the model to generate content based on given inputs.
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "Qwen/Qwen3-4B"
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto"
+)
+# prepare the model input
+prompt = "Give me a short introduction to large language model."
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32768
+)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
+# parsing thinking content
+try:
+    # rindex finding 151668 (</think>)
+    index = len(output_ids) - output_ids[::-1].index(151668)
+except ValueError:
+    index = 0
+thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+print("thinking content:", thinking_content)
+print("content:", content)
+```
+For deployment, you can use `sglang>=0.4.6.post1` or `vllm>=0.8.5` to create an OpenAI-compatible API endpoint:
+- SGLang:
+    ```shell
+    python -m sglang.launch_server --model-path Qwen/Qwen3-4B --reasoning-parser qwen3
+    ```
+- vLLM:
+    ```shell
+    vllm serve Qwen/Qwen3-4B --enable-reasoning --reasoning-parser deepseek_r1
+    ```
+For local use, applications such as Ollama, LMStudio, MLX-LM, llama.cpp, and KTransformers have also supported Qwen3.
+## Switching Between Thinking and Non-Thinking Mode
+> [!TIP]
+> The `enable_thinking` switch is also available in APIs created by SGLang and vLLM. 
+> Please refer to our documentation for [SGLang](https://qwen.readthedocs.io/en/latest/deployment/sglang.html#thinking-non-thinking-modes) and [vLLM](https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes) users.
+### `enable_thinking=True`
+By default, Qwen3 has thinking capabilities enabled, similar to QwQ-32B. This means the model will use its reasoning abilities to enhance the quality of generated responses. For example, when explicitly setting `enable_thinking=True` or leaving it as the default value in `tokenizer.apply_chat_template`, the model will engage its thinking mode.
+```python
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True  # True is the default value for enable_thinking
+)
+```
+In this mode, the model will generate think content wrapped in a `<think>...</think>` block, followed by the final response.
+> [!NOTE]
+> For thinking mode, use `Temperature=0.6`, `TopP=0.95`, `TopK=20`, and `MinP=0` (the default setting in `generation_config.json`). **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions. For more detailed guidance, please refer to the [Best Practices](#best-practices) section.
+### `enable_thinking=False`
+We provide a hard switch to strictly disable the model's thinking behavior, aligning its functionality with the previous Qwen2.5-Instruct models. This mode is particularly useful in scenarios where disabling thinking is essential for enhancing efficiency.
+```python
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=False  # Setting enable_thinking=False disables thinking mode
+)
+```
+In this mode, the model will not generate any think content and will not include a `<think>...</think>` block.
+> [!NOTE]
+> For non-thinking mode, we suggest using `Temperature=0.7`, `TopP=0.8`, `TopK=20`, and `MinP=0`. For more detailed guidance, please refer to the [Best Practices](#best-practices) section.
+### Advanced Usage: Switching Between Thinking and Non-Thinking Modes via User Input
+We provide a soft switch mechanism that allows users to dynamically control the model's behavior when `enable_thinking=True`. Specifically, you can add `/think` and `/no_think` to user prompts or system messages to switch the model's thinking mode from turn to turn. The model will follow the most recent instruction in multi-turn conversations.
+Here is an example of a multi-turn conversation:
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+class QwenChatbot:
+    def __init__(self, model_name="Qwen/Qwen3-4B"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name)
+        self.history = []
+    def generate_response(self, user_input):
+        messages = self.history + [{"role": "user", "content": user_input}]
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        inputs = self.tokenizer(text, return_tensors="pt")
+        response_ids = self.model.generate(**inputs, max_new_tokens=32768)[0][len(inputs.input_ids[0]):].tolist()
+        response = self.tokenizer.decode(response_ids, skip_special_tokens=True)
+        # Update history
+        self.history.append({"role": "user", "content": user_input})
+        self.history.append({"role": "assistant", "content": response})
+        return response
+# Example Usage
+if __name__ == "__main__":
+    chatbot = QwenChatbot()
+    # First input (without /think or /no_think tags, thinking mode is enabled by default)
+    user_input_1 = "How many r's in strawberries?"
+    print(f"User: {user_input_1}")
+    response_1 = chatbot.generate_response(user_input_1)
+    print(f"Bot: {response_1}")
+    print("----------------------")
+    # Second input with /no_think
+    user_input_2 = "Then, how many r's in blueberries? /no_think"
+    print(f"User: {user_input_2}")
+    response_2 = chatbot.generate_response(user_input_2)
+    print(f"Bot: {response_2}") 
+    print("----------------------")
+    # Third input with /think
+    user_input_3 = "Really? /think"
+    print(f"User: {user_input_3}")
+    response_3 = chatbot.generate_response(user_input_3)
+    print(f"Bot: {response_3}")
+```
+> [!NOTE]
+> For API compatibility, when `enable_thinking=True`, regardless of whether the user uses `/think` or `/no_think`, the model will always output a block wrapped in `<think>...</think>`. However, the content inside this block may be empty if thinking is disabled.
+> When `enable_thinking=False`, the soft switches are not valid. Regardless of any `/think` or `/no_think` tags input by the user, the model will not generate think content and will not include a `<think>...</think>` block.
+## Agentic Use
+Qwen3 excels in tool calling capabilities. We recommend using [Qwen-Agent](https://github.com/QwenLM/Qwen-Agent) to make the best use of agentic ability of Qwen3. Qwen-Agent encapsulates tool-calling templates and tool-calling parsers internally, greatly reducing coding complexity.
+To define the available tools, you can use the MCP configuration file, use the integrated tool of Qwen-Agent, or integrate other tools by yourself.
+```python
+from qwen_agent.agents import Assistant
+# Define LLM
+llm_cfg = {
+    'model': 'Qwen3-4B',
+    # Use the endpoint provided by Alibaba Model Studio:
+    # 'model_type': 'qwen_dashscope',
+    # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
+    # Use a custom endpoint compatible with OpenAI API:
+    'model_server': 'http://localhost:8000/v1',  # api_base
+    'api_key': 'EMPTY',
+    # Other parameters:
+    # 'generate_cfg': {
+    #         # Add: When the response content is `<think>this is the thought</think>this is the answer;
+    #         # Do not add: When the response has been separated by reasoning_content and content.
+    #         'thought_in_content': True,
+    #     },
+}
+# Define Tools
+tools = [
+    {'mcpServers': {  # You can specify the MCP configuration file
+            'time': {
+                'command': 'uvx',
+                'args': ['mcp-server-time', '--local-timezone=Asia/Shanghai']
+            },
+            "fetch": {
+                "command": "uvx",
+                "args": ["mcp-server-fetch"]
+            }
+        }
+    },
+  'code_interpreter',  # Built-in tools
+]
+# Define Agent
+bot = Assistant(llm=llm_cfg, function_list=tools)
+# Streaming generation
+messages = [{'role': 'user', 'content': 'https://qwenlm.github.io/blog/ Introduce the latest developments of Qwen'}]
+for responses in bot.run(messages=messages):
+    pass
+print(responses)
+```
+## Processing Long Texts
+Qwen3 natively supports context lengths of up to 32,768 tokens. For conversations where the total length (including both input and output) significantly exceeds this limit, we recommend using RoPE scaling techniques to handle long texts effectively. We have validated the model's performance on context lengths of up to 131,072 tokens using the [YaRN](https://arxiv.org/abs/2309.00071) method.
+YaRN is currently supported by several inference frameworks, e.g., `transformers` and `llama.cpp` for local use, `vllm` and `sglang` for deployment. In general, there are two approaches to enabling YaRN for supported frameworks:
+- Modifying the model files:
+  In the `config.json` file, add the `rope_scaling` fields:
+    ```json
+    {
+        ...,
+        "rope_scaling": {
+            "rope_type": "yarn",
+            "factor": 4.0,
+            "original_max_position_embeddings": 32768
+        }
+    }
+    ```
+  For `llama.cpp`, you need to regenerate the GGUF file after the modification.
+- Passing command line arguments:
+  For `vllm`, you can use
+    ```shell
+    vllm serve ... --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --max-model-len 131072  
+    ```
+  For `sglang`, you can use
+    ```shell
+    python -m sglang.launch_server ... --json-model-override-args '{"rope_scaling":{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}}'
+    ```
+  For `llama-server` from `llama.cpp`, you can use
+    ```shell
+    llama-server ... --rope-scaling yarn --rope-scale 4 --yarn-orig-ctx 32768
+    ```
+> [!IMPORTANT]
+> If you encounter the following warning
+> ```
+> Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'original_max_position_embeddings'}
+> ```
+> please upgrade `transformers>=4.51.0`.
+> [!NOTE]
+> All the notable open-source frameworks implement static YaRN, which means the scaling factor remains constant regardless of input length, **potentially impacting performance on shorter texts.**
+> We advise adding the `rope_scaling` configuration only when processing long contexts is required. 
+> It is also recommended to modify the `factor` as needed. For example, if the typical context length for your application is 65,536 tokens, it would be better to set `factor` as 2.0. 
+> [!NOTE]
+> The default `max_position_embeddings` in `config.json` is set to 40,960. This allocation includes reserving 32,768 tokens for outputs and 8,192 tokens for typical prompts, which is sufficient for most scenarios involving short text processing. If the average context length does not exceed 32,768 tokens, we do not recommend enabling YaRN in this scenario, as it may potentially degrade model performance.
+> [!TIP]
+> The endpoint provided by Alibaba Model Studio supports dynamic YaRN by default and no extra configuration is needed.
+## Best Practices
+To achieve optimal performance, we recommend the following settings:
+1. **Sampling Parameters**:
+   - For thinking mode (`enable_thinking=True`), use `Temperature=0.6`, `TopP=0.95`, `TopK=20`, and `MinP=0`. **DO NOT use greedy decoding**, as it can lead to performance degradation and endless repetitions.
+   - For non-thinking mode (`enable_thinking=False`), we suggest using `Temperature=0.7`, `TopP=0.8`, `TopK=20`, and `MinP=0`.
+   - For supported frameworks, you can adjust the `presence_penalty` parameter between 0 and 2 to reduce endless repetitions. However, using a higher value may occasionally result in language mixing and a slight decrease in model performance.
+2. **Adequate Output Length**: We recommend using an output length of 32,768 tokens for most queries. For benchmarking on highly complex problems, such as those found in math and programming competitions, we suggest setting the max output length to 38,912 tokens. This provides the model with sufficient space to generate detailed and comprehensive responses, thereby enhancing its overall performance.
+3. **Standardize Output Format**: We recommend using prompts to standardize model outputs when benchmarking.
+   - **Math Problems**: Include "Please reason step by step, and put your final answer within \boxed{}." in the prompt.
+   - **Multiple-Choice Questions**: Add the following JSON structure to the prompt to standardize responses: "Please show your choice in the `answer` field with only the choice letter, e.g., `"answer": "C"`."
+4. **No Thinking Content in History**: In multi-turn conversations, the historical model output should only include the final output part and does not need to include the thinking content. It is implemented in the provided chat template in Jinja2. However, for frameworks that do not directly use the Jinja2 chat template, it is up to the developers to ensure that the best practice is followed.
+### Citation
+If you find our work helpful, feel free to cite us.
+```
+@misc{qwen3technicalreport,
+      title={Qwen3 Technical Report}, 
+      author={Qwen Team},
+      year={2025},
+      eprint={2505.09388},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2505.09388}, 
+}
+```
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# Qwen3
+同时具备Distil+SFT+RL的方法开源，人工智能大厂所用的主流后训练方法所有人都可以轻松拥有。
+## 论文
+`无`
+## 模型结构
+Qwen3采用通用的Decoder-Only结构，引入了MoE提升性能，首个「混合推理模型」，通过标注标记词将「快思考」与「慢思考」集成进同一个模型，本步骤以Qwen3-4B作为示例，其它模型根据axolotl官方说明以此类推。
+<div align=center>
+    <img src="./doc/qwen.png"/>
+</div>
+## 算法原理
+将输入embedding后放入attention、ffn等提取特征，最后利用Softmax将解码器最后一层产生的未经归一化的分数向量（logits）转换为概率分布，其中每个元素表示生成对应词汇的概率，这使得模型可以生成一个分布，并从中选择最可能的词作为预测结果。
+## 环境配置
+```
+mv axolotl-Qwen3-4B_pytorch axolotl # 去框架名后缀
+```
+### Docker（方法一）
+```
+docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
+# 请将下方命令中的<your IMAGE ID>替换为以上拉取的docker镜像的ID，本镜像为：1fad5f9ac556
+docker run -it --shm-size=64G -v $PWD/axolotl:/home/axolotl -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name axo <your IMAGE ID> bash
+cd /home/axolotl
+pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
+pip install -e . -i https://mirrors.aliyun.com/pypi/simple
+pip install whl/bitsandbytes-0.42.0+das.opt1.dtk2504-py3-none-any.whl
+```
+### Dockerfile（方法二）
+```
+cd /home/axolotl/docker
+docker build --no-cache -t axo:latest .
+docker run --shm-size=64G --name axo -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v $PWD/../../axolotl:/home/axolotl -it axo bash
+# 若遇到Dockerfile启动的方式安装环境需要长时间等待，可注释掉里面的pip安装，启动容器后再安装python库：pip install -r requirements.txt。
+cd /home/axolotl
+pip install -e . -i https://mirrors.aliyun.com/pypi/simple
+pip install whl/bitsandbytes-0.42.0+das.opt1.dtk2504-py3-none-any.whl
+```
+### Anaconda（方法三）
+1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装：
+- https://developer.hpccube.com/tool/
+```
+DTK驱动:dtk2504
+python:python3.10
+torch:2.4.1
+torchvision:0.19.1
+triton:3.0.0
+vllm:0.8.5
+flash-attn:2.6.1
+deepspeed:0.14.2
+apex:1.4.0
+bitsandbytes:0.42.0
+transformers:4.51.3
+```
+`Tips：以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应。`
+2、其它非特殊库参照requirements.txt安装
+```
+cd /home/axolotl
+pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
+pip install -e . -i https://mirrors.aliyun.com/pypi/simple
+pip install whl/bitsandbytes-0.42.0+das.opt1.dtk2504-py3-none-any.whl
+```
+## 数据集
+```
+/home/axolotl/
+    |── axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample
+    |── Rajesh1505/finance-alpaca-1k-test
+    |── tatsu-lab/alpaca
+    └── skrishna/gsm8k_only_answer
+```
+HF下载地址为[axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample](https://huggingface.co/datasets/axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample)，后几个示例数据集较小，项目中已内置。
+## 训练
+预训练权重目录结构：
+```
+/home/axolotl/
+    └── Qwen/Qwen3-4B
+```
+### 单机多卡
+```
+cd /home/axolotl/
+export HF_ENDPOINT=https://hf-mirror.com # 解决报错：MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443)
+## Distil
+axolotl train examples/qwen3/qlora-fsdp-kd_qwen3-4b.yaml # 示例数据为作者利用第三方库distilabel离线推理获取logits数据
+## SFT
+axolotl train examples/qwen3/qlora-fsdp-kd_qwen3-4b.yaml
+## RL
+# 环境bug解决
+1、解决识别不到gsm8k_grpo模块的bug
+export PYTHONPATH=/home/axolotl/axolotl-cookbook/grpo:$PYTHONPATH # 添加gsm8k_grpo.py文件所在目录的绝对或相对路径
+2、解决grpo训练遇到vllm缺失current_count等属性bug
+export LLAMA_NN=0
+3、解决trl调用vllm推理不能多线程的bug
+/usr/local/lib/python3.10/dist-packages/trl/scripts/vllm_serve.py, line 67
+`RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method`
+在"os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn""之后添加以下代码，即：
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+import multiprocessing as mp
+try:
+    mp.set_start_method('spawn', force=True)
+except RuntimeError:
+    pass
+# 训练
+# 1、启动vllm推理，axolotl作者说trl的vllm需使用后两张卡以避免报错。
+HIP_VISIBLE_DEVICES=2,3 axolotl vllm-serve examples/qwen3/rl_qwen3-4b.yaml
+# 2、然后另起一个终端，通过docker exec -it axo bash进入同一容器的主目录，使用前两张卡运行训练代码（同一机器同一镜像内）。
+HIP_VISIBLE_DEVICES=0,1 axolotl train examples/qwen3/rl_qwen3-4b.yaml --num-processes 2 # 若报缓存空间不足可删除缓存：rm -rf /root/.cache/*
+# 其它模型的.yaml文件编写方法以此类推，axolotl提供了目前最先进的GRPO、DAPO等强化学习方法。
+# 注：强化学习显存占用较大，若batchsize较大，vllm推理和RL训练这两边都很容易因显存不足而挂掉。
+```
+备注：
+1、项目中所有`vllm推理需要用到的ip: x.x.x.x`需要替换成读者机器的真实物理地址的ip才能正常进行推理。
+2、以上Distil步骤中的数据采用官方数据进行训练方法示范，对于自己的数据如何通过distilabel离线推理获取logits，项目中已提供[`make_teacher_model_logits_kd`](./make_teacher_model_logits_kd.py)供参考研究，示例数据集`Rajesh1505`制作的Distil数据保存位置为`Rajesh1505/default.parquet`，制作完成后需要根据不同的数据集特点另外重命名保存才能正常读取，本数据的最终保存地址为[`Rajesh1505_logits`](./Rajesh1505/finance-alpaca-1k-train/data/train-00000-of-00001.parquet)，读者的其它数据可根据本示例数据的制作方法自行研究以生成相应正确格式。
+```
+cd Rajesh1505 # 运行完make_teacher_model_logits_kd.py会生成default.parquet
+cp default.parquet finance-alpaca-1k-train/data/train-00000-of-00001.parquet # 不同数据集的放置方法不同，此处需要读者根据相应的数据集特点灵活思考并放置。
+```
+运行`make_teacher_model_logits_kd.py`前需要另起一个vllm镜像专供运行vllm推理，以项目中的权重`Qwen/Qwen3-4B`示例（实际教师模型可自行采用更大参数量的其它模型自行研究制作）：
+```
+vllm serve Qwen/Qwen3-4B --port 8000 --tensor-parallel-size 4
+# vllm serve Qwen/Qwen3-4B --port 8000 --enable-reasoning --reasoning-parser deepseek_r1 --tensor-parallel-size 4
+```
+若运行`make_teacher_model_logits_kd.py`遇到卡住不动可删除缓存解决：
+```
+rm -rf /root/.cache/distilabel/pipelines/*
+```
+若希望深入研究蒸馏数据的制作方法可参考`make_teacher_model_logits_kd.py`顶部注释的参考文档说明。
+3、对于vllm缺失current_count等属性bug的解决也可按如下方式解决：
+```
+# 首先：
+"/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 397, in load_weights
+将load_weights函数内容替换成https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2.py中的load_weights函数内容，目前vllm==0.9.0。
+# 然后：
+"/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/linear.py", line 220, in apply
+"/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 57, in apply
+将这两个apply函数的相应部分更改如下：
+# return torch.matmul(x, layer.weight)
+if x.shape[-1] == layer.weight.shape[-1]:
+    return torch.matmul(x, layer.weight.permute(1, 0))
+else:
+    return torch.matmul(x, layer.weight)
+```
+4、蒸馏（Distil）和强化学习（RL）的操作较为复杂，需多研究本文档和官方文档才能正常使用本项目，本步骤使用到的所有训练命令见[`qwen3`](./qwen3.sh)。
+## 推理
+`无`
+更多资料可参考源项目中的[`README_origin`](./README_origin.md)。
+## result
+`无`
+训练过程loss效果示例：
+```
+# Distil
+{'loss': 0.1297, 'grad_norm': 0.015708118677139282, 'learning_rate': 9.314053963669245e-07, 'epoch': 0.99}
+# SFT
+{'loss': 24.8723, 'grad_norm': 33.37716293334961, 'learning_rate': 2e-05, 'epoch': 0.09}
+# RL
+{'loss': 0.0044, 'grad_norm': 57.25, 'learning_rate': 3e-08, 'num_tokens': 57335.0, 'completions/mean_length': 348.53125, 'completions/min_length': 3.0, 'completions/max_length': 512.0, 'completions/clipped_ratio': 0.53125, 'completions/mean_terminated_length': 163.2666778564453, 'completions/min_terminated_length': 3.0, 'completions/max_terminated_length': 504.0, 'rewards/correctness_reward_func/mean': 0.1875, 'rewards/correctness_reward_func/std': 0.5922891497612, 'rewards/int_reward_func/mean': 0.046875, 'rewards/int_reward_func/std': 0.1480722874403, 'rewards/strict_format_reward_func/mean': 0.0, 'rewards/strict_format_reward_func/std': 0.0, 'rewards/soft_format_reward_func/mean': 0.0, 'rewards/soft_format_reward_func/std': 0.0, 'rewards/xmlcount_reward_func/mean': 0.06262499839067459, 'rewards/xmlcount_reward_func/std': 0.16834138333797455, 'reward': 0.2970000207424164, 'reward_std': 0.1547679901123047, 'kl': 0.005860495170054492, 'clip_ratio/low_mean': 0.0, 'clip_ratio/low_min': 0.0, 'clip_ratio/high_mean': 0.0, 'clip_ratio/high_max': 0.0, 'clip_ratio/region_mean': 0.0, 'epoch': 0.01}
+```
+### 精度
+DCU与GPU精度一致，推理框架：pytorch。
+## 应用场景
+### 算法类别
+`对话问答`
+### 热点应用行业
+`制造,广媒,金融,能源,医疗,家居,教育`
+## 预训练权重
+魔搭社区下载地址为：[Qwen/Qwen3-4B](https://www.modelscope.cn/Qwen/Qwen3-4B.git)
+## 源码仓库及问题反馈
+- http://developer.sourcefind.cn/codes/modelzoo/axolotl-Qwen3-4B_pytorch.git
+## 参考资料
+- https://github.com/axolotl-ai-cloud/axolotl.git
+- http://docs.axolotl.ai/docs/rlhf.html#grpo
+- https://github.com/axolotl-ai-cloud/axolotl-cookbook.git
+- https://distilabel.argilla.io/latest/api/models/llm/llm_gallery
+- https://distilabel.argilla.io/latest/components-gallery/steps/loaddatafromhub
+- https://distilabel.argilla.io/latest/sections/how_to_guides/advanced/serving_an_llm_for_reuse
--- a/README_origin.md
+++ b/README_origin.md
+<p align="center">
+    <picture>
+        <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_white.svg">
+        <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg">
+        <img alt="Axolotl" src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
+    </picture>
+</p>
+<p align="center">
+    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
+    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
+    <br/>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
+    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
+    <br/>
+    <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
+    <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
+    <br/>
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
+</p>
+Axolotl is a tool designed to streamline post-training for various AI models.
+Post-training refers to any modifications or additional training performed on
+pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
+LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
+techniques. With support for multiple model architectures and training configurations,
+Axolotl makes it easy to get started with these techniques.
+Axolotl is designed to work with YAML config files that contain everything you need to
+preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
+and much more.
+Features:
+- Train various Huggingface models such as llama, pythia, falcon, mpt
+- Supports fullfinetune, lora, qlora, relora, and gptq
+- Customize configurations using a simple yaml file or CLI overwrite
+- Load different dataset formats, use custom formats, or bring your own tokenized datasets
+- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
+- Works with single GPU or multiple GPUs via FSDP or Deepspeed
+- Easily run with Docker locally or on the cloud
+- Log results and optionally checkpoints to wandb, mlflow or Comet
+- And more!
+## 🚀 Quick Start
+**Requirements**:
+- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
+- Python 3.11
+- PyTorch ≥2.4.1
+### Installation
+```bash
+pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+# Download example axolotl configs, deepspeed configs
+axolotl fetch examples
+axolotl fetch deepspeed_configs  # OPTIONAL
+```
+Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
+### Your First Fine-tune
+```bash
+# Fetch axolotl examples
+axolotl fetch examples
+# Or, specify a custom path
+axolotl fetch examples --dest path/to/folder
+# Train a model using LoRA
+axolotl train examples/llama-3/lora-1b.yml
+```
+That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
+## ✨ Key Features
+- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
+- **Easy Configuration**: Simple YAML files to control your training setup
+- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
+- **Flexible Dataset Handling**: Use various formats and custom datasets
+- **Cloud Ready**: Run on cloud platforms or local hardware
+## 📚 Documentation
+- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
+- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
+- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
+- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
+## 🤝 Getting Help
+- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
+- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
+- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
+- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
+## 🌟 Contributing
+Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
+## Supported Models
+|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
+|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
+| llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Mixtral8X22 | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
+| falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
+| XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
+| phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
+| Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
+| Jamba       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
+✅: supported
+❌: not supported
+❓: untested
+## ❤️ Sponsors
+Thank you to our sponsors who help make Axolotl possible:
+- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run
+jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale,
+fine-tune large language models, run protein folding simulations, and much more.
+Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
+## 📜 License
+This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
--- a/Rajesh1505/finance-alpaca-1k-test/README.md
+++ b/Rajesh1505/finance-alpaca-1k-test/README.md
+---
+dataset_info:
+  features:
+  - name: instruction
+    dtype: string
+  - name: input
+    dtype: float64
+  - name: output
+    dtype: string
+  - name: text
+    dtype: float64
+  splits:
+  - name: test
+    num_bytes: 1118044
+    num_examples: 1000
+  download_size: 664700
+  dataset_size: 1118044
+configs:
+- config_name: default
+  data_files:
+  - split: test
+    path: data/test-*
+---
--- a/Rajesh1505/finance-alpaca-1k-test/data/test-00000-of-00001.parquet
+++ b/Rajesh1505/finance-alpaca-1k-test/data/test-00000-of-00001.parquet
--- a/Rajesh1505/finance-alpaca-1k-train/data/train-00000-of-00001.parquet
+++ b/Rajesh1505/finance-alpaca-1k-train/data/train-00000-of-00001.parquet
--- a/TODO.md
+++ b/TODO.md
+# todo list
+- [ ] Validation of parameters for combinations that won't work
+## things that are known not to work
+- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
+- adamw_bnb_8bit doesn't play well with FSDP offload
--- a/_quarto.yml
+++ b/_quarto.yml
+project:
+  type: website
+quartodoc:
+  dir: docs/api
+  package: axolotl
+  title: API Reference
+  parser: google
+  sections:
+    - title: Core
+      desc: Core functionality for training
+      contents:
+        - train
+        - evaluate
+        - datasets
+        - convert
+        - prompt_tokenizers
+        - logging_config
+        - core.trainer_builder
+        - core.training_args
+        - core.chat.messages
+        - core.chat.format.chatml
+        - core.chat.format.llama3x
+        - core.chat.format.shared
+        - core.datasets.chat
+        - core.datasets.transforms.chat_builder
+    - title: CLI
+      desc: Command-line interface
+      contents:
+        - cli.main
+        - cli.train
+        - cli.evaluate
+        - cli.args
+        - cli.checks
+        - cli.config
+        - cli.inference
+        - cli.merge_lora
+        - cli.merge_sharded_fsdp_weights
+        - cli.preprocess
+        - cli.sweeps
+        - cli.utils
+        - cli.vllm_serve
+        - cli.cloud.base
+        - cli.cloud.modal_
+        - cli.quantize
+    - title: Trainers
+      desc: Training implementations
+      contents:
+        - core.trainers.base
+        - core.trainers.trl
+        - core.trainers.mamba
+        - core.trainers.relora
+        - core.trainers.dpo.trainer
+        - core.trainers.grpo.trainer
+        - core.trainers.grpo.sampler
+        - core.trainers.utils
+    - title: Model Loading
+      desc: Functionality for loading and patching models, tokenizers, etc.
+      contents:
+        - loaders.model
+        - loaders.tokenizer
+        - loaders.processor
+        - loaders.adapter
+        - loaders.patch_manager
+        - loaders.constants
+    - title: Mixins
+      desc: Mixin classes for augmenting trainers
+      contents:
+        - core.trainers.mixins.optimizer
+        - core.trainers.mixins.rng_state_loader
+        - core.trainers.mixins.scheduler
+    - title: Context Managers
+      desc: Context managers for altering trainer behaviors
+      contents:
+        - utils.ctx_managers.sequence_parallel
+    - title: Prompt Strategies
+      desc: Prompt formatting strategies
+      contents:
+        - prompt_strategies.base
+        - prompt_strategies.chat_template
+        - prompt_strategies.alpaca_chat
+        - prompt_strategies.alpaca_instruct
+        - prompt_strategies.alpaca_w_system
+        - prompt_strategies.user_defined
+        - prompt_strategies.llama2_chat
+        - prompt_strategies.completion
+        - prompt_strategies.input_output
+        - prompt_strategies.stepwise_supervised
+        - prompt_strategies.metharme
+        - prompt_strategies.orcamini
+        - prompt_strategies.pygmalion
+        - prompt_strategies.messages.chat
+        - prompt_strategies.dpo.chat_template
+        - prompt_strategies.dpo.llama3
+        - prompt_strategies.dpo.chatml
+        - prompt_strategies.dpo.zephyr
+        - prompt_strategies.dpo.user_defined
+        - prompt_strategies.dpo.passthrough
+        - prompt_strategies.kto.llama3
+        - prompt_strategies.kto.chatml
+        - prompt_strategies.kto.user_defined
+        - prompt_strategies.orpo.chat_template
+        - prompt_strategies.bradley_terry.llama3
+    - title: Kernels
+      desc: Low-level performance optimizations
+      contents:
+        - kernels.lora
+        - kernels.geglu
+        - kernels.swiglu
+        - kernels.quantize
+        - kernels.utils
+    - title: Monkey Patches
+      desc: Runtime patches for model optimizations
+      contents:
+        - monkeypatch.llama_attn_hijack_flash
+        - monkeypatch.llama_attn_hijack_xformers
+        - monkeypatch.mistral_attn_hijack_flash
+        - monkeypatch.multipack
+        - monkeypatch.relora
+        - monkeypatch.llama_expand_mask
+        - monkeypatch.lora_kernels
+        - monkeypatch.utils
+        - monkeypatch.btlm_attn_hijack_flash
+        - monkeypatch.llama_patch_multipack
+        - monkeypatch.stablelm_attn_hijack_flash
+        - monkeypatch.trainer_fsdp_optim
+        - monkeypatch.transformers_fa_utils
+        - monkeypatch.unsloth_
+        - monkeypatch.attention.mllama
+        - monkeypatch.data.batch_dataset_fetcher
+        - monkeypatch.mixtral
+        - monkeypatch.gradient_checkpointing.offload_cpu
+        - monkeypatch.gradient_checkpointing.offload_disk
+    - title: Utils
+      desc: Utility functions
+      contents:
+        - utils.tokenization
+        - utils.chat_templates
+        - utils.lora
+        - utils.model_shard_quant
+        - utils.bench
+        - utils.freeze
+        - utils.trainer
+        - utils.schedulers
+        - utils.distributed
+        - utils.dict
+        - utils.optimizers.adopt
+        - utils.data.pretraining
+        - utils.data.sft
+        - utils.quantization
+    - title: Schemas
+      desc: Pydantic data models for Axolotl config
+      contents:
+        - utils.schemas.config
+        - utils.schemas.model
+        - utils.schemas.training
+        - utils.schemas.datasets
+        - utils.schemas.peft
+        - utils.schemas.trl
+        - utils.schemas.multimodal
+        - utils.schemas.integrations
+        - utils.schemas.enums
+        - utils.schemas.utils
+    - title: Integrations
+      desc: Third-party integrations and extensions
+      contents:
+        - integrations.base
+        - integrations.cut_cross_entropy.args
+        - integrations.grokfast.optimizer
+        - integrations.kd.trainer
+        - integrations.liger.args
+        - integrations.lm_eval.args
+        - integrations.spectrum.args
+    - title: Common
+      desc: Common utilities and shared functionality
+      contents:
+        - common.architectures
+        - common.const
+        - common.datasets
+    - title: Models
+      desc: Custom model implementations
+      contents:
+        - models.mamba.modeling_mamba
+    - title: Data Processing
+      desc: Data processing utilities
+      contents:
+        - utils.collators.core
+        - utils.collators.batching
+        - utils.collators.mamba
+        - utils.collators.mm_chat
+        - utils.samplers.multipack
+    - title: Callbacks
+      desc: Training callbacks
+      contents:
+        - utils.callbacks.perplexity
+        - utils.callbacks.profiler
+        - utils.callbacks.lisa
+        - utils.callbacks.mlflow_
+        - utils.callbacks.comet_
+        - utils.callbacks.qat
+website:
+  title: "Axolotl"
+  description: "We make fine-tuning accessible, scalable, and fun"
+  favicon: favicon.jpg
+  google-analytics: "G-9KYCVJBNMQ"
+  navbar:
+    logo: image/axolotl_logo_digital_white.svg
+    title: false
+    background: dark
+    pinned: false
+    collapse: false
+    tools:
+    - icon: twitter
+      href: https://twitter.com/axolotl_ai
+    - icon: github
+      href: https://github.com/axolotl-ai-cloud/axolotl/
+    - icon: discord
+      href: https://discord.gg/7m9sfhzaf3
+  sidebar:
+      pinned: true
+      collapse-level: 2
+      style: docked
+      contents:
+        - text: Home
+          href: index.qmd
+        - section: "Getting Started"
+          contents:
+            - docs/getting-started.qmd
+            - docs/installation.qmd
+            - docs/inference.qmd
+            - docs/cli.qmd
+            - docs/config.qmd
+            - text: "API Reference"
+              href: docs/api
+        - section: "Dataset Formats"
+          contents: docs/dataset-formats/*
+        - section: "Deployments"
+          contents:
+            - docs/docker.qmd
+            - docs/multi-gpu.qmd
+            - docs/multi-node.qmd
+            - docs/ray-integration.qmd
+            - docs/amd_hpc.qmd
+            - docs/mac.qmd
+        - section: "How To Guides"
+          contents:
+            - docs/multimodal.qmd
+            - docs/rlhf.qmd
+            - docs/reward_modelling.qmd
+            - docs/lr_groups.qmd
+            - docs/lora_optims.qmd
+            - docs/dataset_loading.qmd
+            - docs/qat.qmd
+            - docs/quantize.qmd
+        - section: "Core Concepts"
+          contents:
+            - docs/batch_vs_grad.qmd
+            - docs/dataset_preprocessing.qmd
+            - docs/multipack.qmd
+        - section: "Advanced Features"
+          contents:
+            - docs/fsdp_qlora.qmd
+            - docs/unsloth.qmd
+            - docs/torchao.qmd
+            - docs/custom_integrations.qmd
+            - docs/sequence_parallelism.qmd
+        - section: "Troubleshooting"
+          contents:
+            - docs/faq.qmd
+            - docs/debugging.qmd
+            - docs/nccl.qmd
+format:
+  html:
+    theme: darkly
+    css: styles.css
+    toc: true
+    # Enable better handling of line breaks in markdown
+    preserve-tabs: true
+    html-math-method: mathjax
+    # Improved markdown processing options
+    md-extensions:
+      - markdown_it
+      - def_list
+      - attr_list
+      - fenced_divs
+      - tables
+      - html_admonition
+      - lineblocks
+      - fancy_lists
+    # Control whitespace handling
+    whitespace: preserve
+    # Process newlines in paragraphs
+    wrap: preserve
+    # Better line break handling
+    preserve-linebreaks: true
--- a/axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample/README.md
+++ b/axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample/README.md
--- a/axolotl-cookbook/LICENSE
+++ b/axolotl-cookbook/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/axolotl-cookbook/README.md
+++ b/axolotl-cookbook/README.md
+# axolotl-cookbook
\ No newline at end of file
--- a/axolotl-cookbook/examples/talk_like_a_pirate/README.md
+++ b/axolotl-cookbook/examples/talk_like_a_pirate/README.md
+# Talk like a Pirate
+Built on a synthetic dataset from Gemini Flash 1.5, we sampled 10,000 rows from ultrachat-200k to respond like a pirate.
+Dataset: [winglian/pirate-ultrachat-10k](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k)
+WandB: https://wandb.ai/axolotl-ai/pirate-ultrachat-llama31
+Model (LoRA adapter): [winglian/llama-3.1-8b-talk-like-a-pirate](https://huggingface.co/winglian/llama-3.1-8b-talk-like-a-pirate)
+### Hardware
+With a single L40S GPU (48GB), this model will train in approximately 6 hours. See [config](https://wandb.ai/axolotl-ai/pirate-ultrachat-llama31/runs/ux2ukksw/files/tmp/axolotl_config_tc5xe_jx.yml).
+For Multi-GPU, there is additional memory overhead needed to use DDP, thus we need to employ DeepSpeed ZeRO-2 in order to
+finetune the model without OOM-ing.