v1.0

4d4d8f59 · chenzk · 4d4d8f59 · 4d4d8f59 · 4d4d8f59 · 4d4d8f59
Commit 4d4d8f59 authored Jun 04, 2025 by chenzk
20 changed files
--- a/bug.txt
+++ b/bug.txt
+axolotl/src/axolotl/integrations/kd/trainer.py, line 77
+if num_items_in_batch is None:
+    num_items_in_batch = -1
+
+
+/usr/local/lib/python3.10/dist-packages/trl/scripts/vllm_serve.py, line 67
+
+Add the following after the line `os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"` (the snippet below repeats that line for context):
+```
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+import multiprocessing as mp
+try:
+    mp.set_start_method('spawn', force=True)
+    print("spawned")
+except RuntimeError:
+    pass
+```
+
+
+"/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 397, in load_weights
+```
+load_weights -> https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2.py # vllm==0.9.0
+```
+"/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/linear.py", line 220, in apply
+"/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 57, in apply
+```
+# return torch.matmul(x, layer.weight)
+                if x.shape[-1] == layer.weight.shape[-1]:
+                    return torch.matmul(x, layer.weight.permute(1, 0))
+                else:
+                    return torch.matmul(x, layer.weight)
+```
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
+FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+
+# Build configuration. Every `{{ ... }}` placeholder is filled in from the Jinja
+# context (`df_args`) by the Modal apps under cicd/ before the image is built.
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
+ENV CUDA="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
+
+# Install editors, curl and NCCL. The apt package lists are removed in the same
+# layer that created them so they do not add to the image size.
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+# Check out the exact ref under test. The refspec is quoted so that an unusual
+# ref value cannot be word-split or glob-expanded by the shell.
+RUN git fetch origin "+$GITHUB_REF" && \
+    git checkout FETCH_HEAD
+
+# Nightly builds: repoint the Hugging Face dependencies in requirements.txt at
+# the `main` branch of their respective repositories.
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
+RUN pip install packaging==23.2 setuptools==75.8.0
+
+# If AXOLOTL_EXTRAS is set, append it to the bracketed extras list.
+# The extras spec is quoted because unquoted `[...]` is a shell glob pattern;
+# $AXOLOTL_ARGS stays unquoted on purpose so it can expand to several arguments.
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        pip install --no-build-isolation -e ".[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS]" $AXOLOTL_ARGS; \
+    else \
+        pip install --no-build-isolation -e ".[deepspeed,flash-attn,ring-flash-attn,optimizers,ray]" $AXOLOTL_ARGS; \
+    fi
+
+# NOTE(review): these pipelines run without `pipefail`, so a failure of the
+# Python generator script is masked by the exit status of `sh`. Consider
+# `SHELL ["/bin/bash", "-o", "pipefail", "-c"]` once both scripts are known to
+# succeed on every supported base image.
+RUN python scripts/unsloth_install.py | sh
+RUN python scripts/cutcrossentropy_install.py | sh
+
+# So we can test the Docker image
+RUN pip install -r requirements-dev.txt -r requirements-tests.txt
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
--- a/cicd/__init__.py
+++ b/cicd/__init__.py
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
+#!/bin/bash
+# Single-GPU CI entry point: runs the test suites in stages, accumulating
+# coverage across stages, then uploads the combined report to Codecov.
+set -e
+
+# Sanity check: the installed torch must match the version this image targets.
+python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
+
+# Run unit tests with initial coverage report
+pytest -v --durations=10 -n8 \
+  --ignore=tests/e2e/ \
+  --ignore=tests/patched/ \
+  --ignore=tests/cli \
+  /workspace/axolotl/tests/ \
+  --cov=axolotl
+
+# Run lora kernels tests with coverage append
+pytest -v --durations=10 \
+  /workspace/axolotl/tests/e2e/patched/lora_kernels \
+  --cov=axolotl \
+  --cov-append
+
+# Run patched tests excluding lora kernels with coverage append
+pytest --full-trace -vvv --durations=10 \
+  --ignore=tests/e2e/patched/lora_kernels \
+  /workspace/axolotl/tests/e2e/patched \
+  --cov=axolotl \
+  --cov-append
+
+# Run solo tests with coverage append
+pytest -v --durations=10 -n1 \
+  /workspace/axolotl/tests/e2e/solo/ \
+  --cov=axolotl \
+  --cov-append
+
+# Run integration tests with coverage append
+pytest -v --durations=10 \
+  /workspace/axolotl/tests/e2e/integrations/ \
+  --cov=axolotl \
+  --cov-append
+
+# Run CLI tests with coverage append
+pytest -v --durations=10 /workspace/axolotl/tests/cli \
+  --cov=axolotl \
+  --cov-append
+
+# Run remaining e2e tests with coverage append and final report
+pytest -v --durations=10 \
+  --ignore=tests/e2e/solo/ \
+  --ignore=tests/e2e/patched/ \
+  --ignore=tests/e2e/multigpu/ \
+  --ignore=tests/e2e/integrations/ \
+  --ignore=tests/cli \
+  /workspace/axolotl/tests/e2e/ \
+  --cov=axolotl \
+  --cov-append \
+  --cov-report=xml:e2e-coverage.xml
+
+# Upload coverage to Codecov. The token is quoted so an empty or unset value is
+# passed as an empty argument instead of letting `-t` swallow the following `-f`.
+# The upload is best-effort: `|| true` keeps a failed upload from failing CI.
+codecov upload-process -t "${CODECOV_TOKEN}" -f e2e-coverage.xml -F "e2e,pytorch-${PYTORCH_VERSION}" || true
--- a/cicd/cleanup.py
+++ b/cicd/cleanup.py
+"""Modal app to run axolotl GPU cleanup"""
+
+from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    timeout=60 * 60,
+    cpu=8.0,
+    memory=131072,
+    volumes=VOLUME_CONFIG,
+)
+def cleanup():
+    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: invoke the `cleanup` function remotely."""
+    cleanup.remote()
--- a/cicd/cleanup.sh
+++ b/cicd/cleanup.sh
+#!/bin/bash
+set -e
+
+# cleanup old cache files for datasets processing and intermediate mappings
+find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
+find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
+"""Modal app to run axolotl GPU tests"""
+
+from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    gpu=GPU_CONFIG,
+    timeout=90 * 60,  # 90 min
+    cpu=8.0,
+    memory=131072,
+    volumes=VOLUME_CONFIG,
+)
+def cicd_pytest():
+    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: invoke the `cicd_pytest` function remotely."""
+    cicd_pytest.remote()
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
+"""
+modal application to run axolotl gpu tests in Modal
+"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 2))
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+
+
+@app.function(
+    image=cicd_image,
+    gpu=GPU_CONFIG,
+    timeout=90 * 60,
+    cpu=16.0,
+    memory=131072 * N_GPUS,
+    volumes=VOLUME_CONFIG,
+)
+def cicd_pytest():
+    run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: invoke the `cicd_pytest` function remotely."""
+    cicd_pytest.remote()
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
+#!/bin/bash
+set -e
+
+# Stage 1: main multi-GPU suite. Only two tests run at a time to avoid OOM on
+# GPU (with coverage collection). The solo/ and patched/ directories are
+# excluded here and run in their own stages below.
+pytest -v -n2 \
+  --cov=axolotl \
+  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
+  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
+  /workspace/axolotl/tests/e2e/multigpu/
+
+# Stage 2: solo tests, one at a time, appending to the coverage data.
+pytest -v --durations=10 -n1 \
+  --cov=axolotl \
+  --cov-append \
+  /workspace/axolotl/tests/e2e/multigpu/solo/
+
+# Stage 3: patched tests, one at a time; also writes the final XML report.
+pytest -v --durations=10 -n1 \
+  --cov=axolotl \
+  --cov-append \
+  --cov-report=xml:multigpu-coverage.xml \
+  /workspace/axolotl/tests/e2e/multigpu/patched/
+
+# Upload coverage to Codecov (best-effort: a failed upload does not fail the run)
+codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
+"""Modal app to run axolotl GPU tests"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+# Directory containing this module; Dockerfile.jinja is loaded from here.
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+# Template context for Dockerfile.jinja. The same mapping is also exported into
+# the image environment through `.env(df_args)` below. Values come from the
+# caller's environment, with defaults for local runs.
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+# Write the rendered Dockerfile to a fresh temporary directory so the image
+# can be built from a real file path.
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+# Named volume mounted at the same path that HF_HOME points to above.
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+# Number of GPUs requested for the test functions; overridable via N_GPUS.
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
--- a/codecov.yml
+++ b/codecov.yml
+codecov:
+  require_ci_to_pass: yes
+  notify:
+    wait_for_ci: true
+
+coverage:
+  precision: 2
+  round: down
+  range: "70...100"
+  status:
+    project:
+      default:
+        # basic
+        target: auto
+        threshold: 0%
+        base: auto
+        # advanced
+        branches: null
+        if_no_uploads: error
+        if_not_found: success
+        if_ci_failed: error
+        only_pulls: true
+        flags: null
+        paths: null
+    patch:
+      default:
+        # basic
+        target: auto
+        threshold: 0%
+        base: auto
+        # advanced
+        branches: null
+        if_no_uploads: error
+        if_not_found: success
+        if_ci_failed: error
+        only_pulls: false
+        flags: null
+        paths: null
+
+parsers:
+  gcov:
+    branch_detection:
+      conditional: yes
+      loop: yes
+      method: no
+      macro: no
+
+comment:
+  layout: "reach,diff,flags,files,footer"
+  behavior: default
+  require_changes: no
+  require_base: no
+  require_head: yes
+
+github_checks:
+  annotations: false
--- a/deepspeed_configs/zero1.json
+++ b/deepspeed_configs/zero1.json
+{
+  "zero_optimization": {
+    "stage": 1,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero1_torch_compile.json
+++ b/deepspeed_configs/zero1_torch_compile.json
+{
+  "zero_optimization": {
+    "stage": 1,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "compile": {
+    "disable": false,
+    "backend": "inductor"
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero2.json
+++ b/deepspeed_configs/zero2.json
+{
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero3.json
+++ b/deepspeed_configs/zero3.json
+{
+  "zero_optimization": {
+    "stage": 3,
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
+{
+  "zero_optimization": {
+    "stage": 3,
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+{
+  "zero_force_ds_cpu_optimizer": false,
+  "zero_allow_untested_optimizer": true,
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+{
+  "zero_force_ds_cpu_optimizer": false,
+  "zero_allow_untested_optimizer": true,
+  "zero_optimization": {
+    "stage": 3,
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/devtools/README.md
+++ b/devtools/README.md
+This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
--- a/devtools/dev_chat_template.yml
+++ b/devtools/dev_chat_template.yml
+# Example config for debugging the chat_template prompt format
+base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+    shards: 10
+val_set_size: 0
+output_dir: temp_debug/axolotl_outputs/model
+dataset_prepared_path: temp_debug/axolotl_outputs/data
+dataset_processes: 1
+
+sequence_len: 4096
+sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+micro_batch_size: 1
+num_epochs: 1
+max_steps: 10
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: true
+tf32: false
+
+gradient_checkpointing: true
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 10
+weight_decay: 0.0