Commit 0b5cd1a0 authored by liangjing's avatar liangjing
Browse files

update

parent 5352a639
Pipeline #1848 passed with stage
# Build backend (PEP 518): plain setuptools; setuptools-scm is available at build time.
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
# Core project metadata (PEP 621). `version` is dynamic and resolved below
# under [tool.setuptools.dynamic].
[project]
name = "unsloth"
dynamic = ["version"]
description = "2-5X faster LLM finetuning"
readme = "README.md"
requires-python = ">=3.9"
license = {file = "LICENSE"}
keywords = ["ai", "llm",]
authors = [
{email = "info@unsloth.ai"},
{name = "Unsloth AI team"},
]
maintainers = [
{name = "Daniel Han", email = "danielhanchen@gmail.com"},
{name = "Michael Han", email = "info@unsloth.ai"},
]
classifiers = [
"Programming Language :: Python",
]
# Read the package version from the source tree at build time.
[tool.setuptools.dynamic]
version = {attr = "unsloth.models._utils.__version__"}
[tool.setuptools]
include-package-data = false
# Keep repository images out of the built distribution.
[tool.setuptools.packages.find]
exclude = ["images*"]
[project.optional-dependencies]
# Base Hugging Face training stack. Every CUDA/torch-specific extra below
# layers on top of this via "unsloth[huggingface]".
huggingface = [
"unsloth_zoo",
"packaging",
"tyro",
"transformers>=4.44.2",
"datasets>=2.16.0",
"sentencepiece>=0.2.0",
"tqdm",
"psutil",
"wheel>=0.42.0",
"numpy",
"accelerate>=0.34.1",
"trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3",
"peft>=0.7.1,!=0.11.0",
"protobuf<4.0.0",
"huggingface_hub",
"hf_transfer",
]
# Pinned xformers wheels, one extra per (CUDA version, torch version) pair.
# Each entry uses a PEP 508 environment marker to pick the wheel matching the
# running CPython; groups without a cp312 wheel predate Python 3.12 support.
cu118only = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121only = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch212 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121onlytorch212 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'",
]
cu121onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'",
]
cu118onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'",
]
# From 0.0.28 onwards the wheels switch to the manylinux_2_28 platform tag.
cu121onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu124onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu121onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu124onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
# User-facing composite extras: base HF stack + bitsandbytes + the matching
# pinned-xformers group above. Naming scheme: cu<CUDA>[-torch<version>];
# no torch suffix means the oldest supported torch for that CUDA version.
cu118 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118only]",
]
cu121 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121only]",
]
cu118-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch211]",
]
cu121-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
]
cu118-torch212 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch212]",
]
cu121-torch212 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch212]",
]
cu118-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch220]",
]
cu121-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
]
cu118-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch230]",
]
cu121-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch230]",
]
cu118-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch240]",
]
cu121-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch240]",
]
cu121-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch250]",
]
cu124-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch240]",
]
cu124-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch250]",
]
# Platform presets for hosted notebook / packaged environments.
kaggle = [
"unsloth[huggingface]",
]
kaggle-new = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
]
conda = [
"unsloth[huggingface]",
]
colab-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
]
colab-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
colab-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
]
colab-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
# Flat dependency list for Colab (no xformers pin, no self-reference).
colab-new = [
"unsloth_zoo",
"packaging",
"tyro",
"transformers>=4.44.2",
"datasets>=2.16.0",
"sentencepiece>=0.2.0",
"tqdm",
"psutil",
"wheel>=0.42.0",
"numpy",
"protobuf<4.0.0",
"huggingface_hub",
"hf_transfer",
"bitsandbytes>=0.43.3",
]
colab-no-deps = [
"accelerate>=0.34.1",
"trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3",
"peft>=0.7.1",
"xformers<0.0.27",
"bitsandbytes>=0.43.3",
"protobuf<4.0.0",
]
colab = [
"unsloth[cu121]",
]
colab-ampere = [
"unsloth[colab-ampere-torch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
# "-ampere" variants (SM 8.0+ GPUs) additionally pull in flash-attn, which
# needs packaging + ninja to build from source.
cu118-ampere = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118only]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121only]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch230]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch230]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch240]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch240]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch250]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu124-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch240]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu124-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch250]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
[project.urls]
# FIX: homepage previously used plain http:// while the other URLs use
# https://; serve the secure scheme directly (NOTE(review): assumes the site
# serves TLS at www.unsloth.ai — confirm before release).
homepage = "https://www.unsloth.ai"
documentation = "https://github.com/unslothai/unsloth"
repository = "https://github.com/unslothai/unsloth"
#!/usr/bin/env python3
"""
🦥 Starter Script for Fine-Tuning FastLanguageModel with Unsloth
This script is designed as a starting point for fine-tuning your models using unsloth.
It includes configurable options for model loading, PEFT parameters, training arguments,
and model saving/pushing functionalities.
You will likely want to customize this script to suit your specific use case
and requirements.
Here are a few suggestions for customization:
- Modify the dataset loading and preprocessing steps to match your data.
- Customize the model saving and pushing configurations.
Usage: (most of the options have valid default values this is an extended example for demonstration purposes)
python unsloth-cli.py --model_name "unsloth/llama-3-8b" --max_seq_length 8192 --dtype None --load_in_4bit \
--r 64 --lora_alpha 32 --lora_dropout 0.1 --bias "none" --use_gradient_checkpointing "unsloth" \
--random_state 3407 --use_rslora --per_device_train_batch_size 4 --gradient_accumulation_steps 8 \
--warmup_steps 5 --max_steps 400 --learning_rate 2e-6 --logging_steps 1 --optim "adamw_8bit" \
--weight_decay 0.005 --lr_scheduler_type "linear" --seed 3407 --output_dir "outputs" \
--report_to "tensorboard" --save_model --save_path "model" --quantization "f16" \
--push_model --hub_path "hf/model" --hub_token "your_hf_token"
To see a full list of configurable options, use:
python unsloth-cli.py --help
Happy fine-tuning!
"""
import argparse
def run(args):
    """Fine-tune a language model with Unsloth from CLI options.

    Loads the base model, wraps it with a LoRA/PEFT adapter, formats the
    dataset using an Alpaca-style prompt, trains via TRL's ``SFTTrainer``,
    and finally saves and/or pushes the result (optionally as GGUF).

    Args:
        args: ``argparse.Namespace`` produced by the parser defined in the
            ``__main__`` section of this script.
    """
    import torch
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from unsloth import is_bfloat16_supported
    import logging
    logging.getLogger('hf-to-gguf').setLevel(logging.WARNING)

    # Load model and tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_name,
        max_seq_length=args.max_seq_length,
        dtype=args.dtype,
        load_in_4bit=args.load_in_4bit,
    )

    # Configure PEFT (LoRA) model; target_modules covers all attention and
    # MLP projections of a Llama-style architecture.
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.r,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias=args.bias,
        use_gradient_checkpointing=args.use_gradient_checkpointing,
        random_state=args.random_state,
        use_rslora=args.use_rslora,
        loftq_config=args.loftq_config,
    )

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN so generation can terminate

    def formatting_prompts_func(examples):
        # Build one Alpaca-formatted training text per example in the batch.
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}

    # Load and format dataset
    dataset = load_dataset(args.dataset, split="train")
    dataset = dataset.map(formatting_prompts_func, batched=True)
    print("Data is formatted and ready!")

    # Configure training arguments; bf16 is preferred when supported,
    # otherwise fall back to fp16.
    training_args = TrainingArguments(
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        warmup_steps=args.warmup_steps,
        max_steps=args.max_steps,
        learning_rate=args.learning_rate,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=args.logging_steps,
        optim=args.optim,
        weight_decay=args.weight_decay,
        lr_scheduler_type=args.lr_scheduler_type,
        seed=args.seed,
        output_dir=args.output_dir,
        report_to=args.report_to,
    )

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=args.max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=training_args,
    )

    # Train model
    trainer_stats = trainer.train()

    # Save model
    if args.save_model:
        # if args.quantization is a list, we will save the model for each quantization method
        if args.save_gguf:
            if isinstance(args.quantization, list):
                for quantization_method in args.quantization:
                    print(f"Saving model with quantization method: {quantization_method}")
                    model.save_pretrained_gguf(
                        args.save_path,
                        tokenizer,
                        quantization_method=quantization_method,
                    )
                    if args.push_model:
                        model.push_to_hub_gguf(
                            hub_path=args.hub_path,
                            hub_token=args.hub_token,
                            quantization_method=quantization_method,
                        )
            else:
                print(f"Saving model with quantization method: {args.quantization}")
                model.save_pretrained_gguf(args.save_path, tokenizer, quantization_method=args.quantization)
                if args.push_model:
                    model.push_to_hub_gguf(
                        hub_path=args.hub_path,
                        hub_token=args.hub_token,
                        # FIX: was `quantization_method` — an undefined name in
                        # this branch (it only exists inside the list loop above),
                        # which raised NameError when pushing a single-method GGUF.
                        quantization_method=args.quantization,
                    )
        else:
            model.save_pretrained_merged(args.save_path, tokenizer, args.save_method)
            if args.push_model:
                model.push_to_hub_merged(args.save_path, tokenizer, args.hub_token)
    else:
        print("Warning: The model is not saved!")
if __name__ == "__main__":
    # Define argument parser
    parser = argparse.ArgumentParser(description="🦥 Fine-tune your llm faster using unsloth!")

    model_group = parser.add_argument_group("🤖 Model Options")
    model_group.add_argument('--model_name', type=str, default="unsloth/llama-3-8b", help="Model name to load")
    model_group.add_argument('--max_seq_length', type=int, default=2048, help="Maximum sequence length, default is 2048. We auto support RoPE Scaling internally!")
    model_group.add_argument('--dtype', type=str, default=None, help="Data type for model (None for auto detection)")
    model_group.add_argument('--load_in_4bit', action='store_true', help="Use 4bit quantization to reduce memory usage")
    model_group.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned", help="Huggingface dataset to use for training")

    lora_group = parser.add_argument_group("🧠 LoRA Options", "These options are used to configure the LoRA model.")
    lora_group.add_argument('--r', type=int, default=16, help="Rank for Lora model, default is 16. (common values: 8, 16, 32, 64, 128)")
    lora_group.add_argument('--lora_alpha', type=int, default=16, help="LoRA alpha parameter, default is 16. (common values: 8, 16, 32, 64, 128)")
    lora_group.add_argument('--lora_dropout', type=float, default=0, help="LoRA dropout rate, default is 0.0 which is optimized.")
    lora_group.add_argument('--bias', type=str, default="none", help="Bias setting for LoRA")
    lora_group.add_argument('--use_gradient_checkpointing', type=str, default="unsloth", help="Use gradient checkpointing")
    lora_group.add_argument('--random_state', type=int, default=3407, help="Random state for reproducibility, default is 3407.")
    lora_group.add_argument('--use_rslora', action='store_true', help="Use rank stabilized LoRA")
    lora_group.add_argument('--loftq_config', type=str, default=None, help="Configuration for LoftQ")

    training_group = parser.add_argument_group("🎓 Training Options")
    training_group.add_argument('--per_device_train_batch_size', type=int, default=2, help="Batch size per device during training, default is 2.")
    training_group.add_argument('--gradient_accumulation_steps', type=int, default=4, help="Number of gradient accumulation steps, default is 4.")
    training_group.add_argument('--warmup_steps', type=int, default=5, help="Number of warmup steps, default is 5.")
    training_group.add_argument('--max_steps', type=int, default=400, help="Maximum number of training steps.")
    training_group.add_argument('--learning_rate', type=float, default=2e-4, help="Learning rate, default is 2e-4.")
    training_group.add_argument('--optim', type=str, default="adamw_8bit", help="Optimizer type.")
    training_group.add_argument('--weight_decay', type=float, default=0.01, help="Weight decay, default is 0.01.")
    training_group.add_argument('--lr_scheduler_type', type=str, default="linear", help="Learning rate scheduler type, default is 'linear'.")
    training_group.add_argument('--seed', type=int, default=3407, help="Seed for reproducibility, default is 3407.")

    # Report/Logging arguments
    report_group = parser.add_argument_group("📊 Report Options")
    report_group.add_argument('--report_to', type=str, default="tensorboard",
        choices=["azure_ml", "clearml", "codecarbon", "comet_ml", "dagshub", "dvclive", "flyte", "mlflow", "neptune", "tensorboard", "wandb", "all", "none"],
        help="The list of integrations to report the results and logs to. Supported platforms are: \n\t\t 'azure_ml', 'clearml', 'codecarbon', 'comet_ml', 'dagshub', 'dvclive', 'flyte', 'mlflow', 'neptune', 'tensorboard', and 'wandb'. Use 'all' to report to all integrations installed, 'none' for no integrations.")
    report_group.add_argument('--logging_steps', type=int, default=1, help="Logging steps, default is 1")

    # Saving and pushing arguments
    save_group = parser.add_argument_group('💾 Save Model Options')
    save_group.add_argument('--output_dir', type=str, default="outputs", help="Output directory")
    save_group.add_argument('--save_model', action='store_true', help="Save the model after training")
    save_group.add_argument('--save_method', type=str, default="merged_16bit", choices=["merged_16bit", "merged_4bit", "lora"], help="Save method for the model, default is 'merged_16bit'")
    save_group.add_argument('--save_gguf', action='store_true', help="Convert the model to GGUF after training")
    save_group.add_argument('--save_path', type=str, default="model", help="Path to save the model")
    # FIX: with nargs="+" argparse leaves a default untouched, so the old
    # string default "q8_0" made `args.quantization` a plain str. run() then
    # took its non-list branch, which crashed with a NameError when pushing.
    # A one-element list default keeps `args.quantization` a list always.
    save_group.add_argument('--quantization', type=str, default=["q8_0"], nargs="+",
        help="Quantization method for saving the model. common values ('f16', 'q4_k_m', 'q8_0'), Check our wiki for all quantization methods https://github.com/unslothai/unsloth/wiki#saving-to-gguf ")

    push_group = parser.add_argument_group('🚀 Push Model Options')
    push_group.add_argument('--push_model', action='store_true', help="Push the model to Hugging Face hub after training")
    push_group.add_argument('--push_gguf', action='store_true', help="Push the model as GGUF to Hugging Face hub after training")
    push_group.add_argument('--hub_path', type=str, default="hf/model", help="Path on Hugging Face hub to push the model")
    push_group.add_argument('--hub_token', type=str, help="Token for pushing the model to Hugging Face hub")

    args = parser.parse_args()
    run(args)
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings, importlib, sys
from packaging.version import Version
import os, re, subprocess, inspect
import numpy as np
# # Define a list of modules to check
# MODULES_TO_CHECK = ["bitsandbytes"]
# # Check if any of the modules in the list have been imported
# for module in MODULES_TO_CHECK:
# if module in sys.modules:
# raise ImportError(f"Unsloth: Please import Unsloth before {module}.")
# pass
# pass
# Check for unsloth_zoo — a hard dependency distributed as `unsloth-zoo`.
try:
    import unsloth_zoo
except ImportError:
    # FIX: was a bare `except:`, which converted *any* failure inside
    # unsloth_zoo (syntax errors, broken transitive deps, KeyboardInterrupt)
    # into a misleading "please install" message. Only a genuinely missing
    # package should produce that hint; other errors now propagate as-is.
    raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth-zoo`")
pass
# Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so
# enabling it will require much more work, so we have to prioritize. Please understand!
# We do have a beta version, which you can contact us about!
# Thank you for your understanding and we appreciate it immensely!
# Force a single visible CUDA device. If CUDA_VISIBLE_DEVICES is already set
# to something that is not a single numeric id (e.g. "0,1" or a GPU UUID),
# keep only the first comma-separated entry; otherwise default to device 0.
if "CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    devices = os.environ["CUDA_VISIBLE_DEVICES"]
    # Check if there are multiple cuda devices set in env
    # (a single device id is all digits; anything else triggers the override)
    if not devices.isdigit():
        first_id = devices.split(",")[0]
        warnings.warn(
            f"Unsloth: 'CUDA_VISIBLE_DEVICES' is currently {devices} \n"\
            "Unsloth currently does not support multi GPU setups - but we are working on it!\n"\
            "Multiple CUDA devices detected but we require a single device.\n"\
            f"We will override CUDA_VISIBLE_DEVICES to first device: {first_id}."
        )
        os.environ["CUDA_VISIBLE_DEVICES"] = str(first_id)
else:
    # warnings.warn("Unsloth: 'CUDA_VISIBLE_DEVICES' is not set. We shall set it ourselves.")
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
pass
# Reduce VRAM usage by reducing fragmentation
# (expandable_segments is removed again further below for torch < 2.2,
# which does not recognise the option).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Fail with an actionable message when torch is absent; any other import-time
# error is re-raised unchanged.
try:
    import torch
except ModuleNotFoundError:
    raise ImportError(
        "Unsloth: Pytorch is not installed. Go to https://pytorch.org/.\n"\
        "We have some installation instructions on our Github page."
    )
except Exception as exception:
    raise exception
pass

# Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions)
# Detection: those platforms export COLAB_*/KAGGLE_* environment variables;
# the leading "\n" join makes the prefix match anchor at variable-name starts.
keynames = "\n" + "\n".join(os.environ.keys())
if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
pass
# We support Pytorch 2
# Fixes https://github.com/unslothai/unsloth/issues/38
# Parse "major.minor" out of torch.__version__ (e.g. "2.4.0+cu121").
torch_version = torch.__version__.split(".")
major_torch, minor_torch = torch_version[0], torch_version[1]
major_torch, minor_torch = int(major_torch), int(minor_torch)
if (major_torch < 2):
    raise ImportError("Unsloth only supports Pytorch 2 for now. Please update your Pytorch to 2.1.\n"\
                      "We have some installation instructions on our Github page.")
elif (major_torch == 2) and (minor_torch < 2):
    # Disable expandable_segments
    # (the allocator option set earlier is not supported before torch 2.2)
    del os.environ["PYTORCH_CUDA_ALLOC_CONF"]
pass

# Torch 2.4 has including_emulation
# Normalise torch.cuda.is_bf16_supported so callers can always invoke it with
# no arguments, whether or not this torch build added the
# `including_emulation` parameter.
major_version, minor_version = torch.cuda.get_device_capability()
SUPPORTS_BFLOAT16 = (major_version >= 8)  # compute capability 8.x+ (Ampere)
old_is_bf16_supported = torch.cuda.is_bf16_supported
if "including_emulation" in str(inspect.signature(old_is_bf16_supported)):
    def is_bf16_supported(including_emulation = False):
        # Delegate to the original, defaulting emulation off.
        return old_is_bf16_supported(including_emulation)
    torch.cuda.is_bf16_supported = is_bf16_supported
else:
    # Older signature takes no arguments; report the capability check result.
    def is_bf16_supported(): return SUPPORTS_BFLOAT16
    torch.cuda.is_bf16_supported = is_bf16_supported
pass
# Try loading bitsandbytes and triton
# Best-effort repair of a broken libcuda link: probe bitsandbytes' CUDA
# library, and if that fails try `ldconfig` against known driver locations,
# then re-probe. Skipped on Hugging Face Spaces (SPACE_* env vars), where
# running ldconfig is not possible anyway.
import bitsandbytes as bnb
if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
    import triton
    # libcuda_dirs moved in triton 3.0; default to a no-op when unavailable.
    libcuda_dirs = lambda: None
    if Version(triton.__version__) >= Version("3.0.0"):
        try: from triton.backends.nvidia.driver import libcuda_dirs
        except: pass
    else: from triton.common.build import libcuda_dirs
    try:
        # Touching this symbol forces bitsandbytes to resolve its native CUDA
        # library; combined with libcuda_dirs() this detects a missing link.
        cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
        libcuda_dirs()
    except:
        warnings.warn(
            "Unsloth: Running `ldconfig /usr/lib64-nvidia` to link CUDA."\
        )

        if os.path.exists("/usr/lib64-nvidia"):
            # Colab-style driver location.
            os.system("ldconfig /usr/lib64-nvidia")
        elif os.path.exists("/usr/local"):
            # Sometimes bitsandbytes cannot be linked properly in Runpod for example
            possible_cudas = subprocess.check_output(["ls", "-al", "/usr/local"]).decode("utf-8").split("\n")
            find_cuda = re.compile(r"[\s](cuda\-[\d\.]{2,})$")
            possible_cudas = [find_cuda.search(x) for x in possible_cudas]
            possible_cudas = [x.group(1) for x in possible_cudas if x is not None]

            # Try linking cuda folder, or everything in local
            if len(possible_cudas) == 0:
                os.system("ldconfig /usr/local/")
            else:
                # Pick the highest-numbered /usr/local/cuda-XX.X directory.
                find_number = re.compile(r"([\d\.]{2,})")
                latest_cuda = np.argsort([float(find_number.search(x).group(1)) for x in possible_cudas])[::-1][0]
                latest_cuda = possible_cudas[latest_cuda]
                os.system(f"ldconfig /usr/local/{latest_cuda}")
        pass

        # Re-import and re-probe; if CUDA is still unlinked, warn the user
        # with manual instructions but keep going (import does not hard-fail).
        importlib.reload(bnb)
        importlib.reload(triton)
        try:
            libcuda_dirs = lambda: None
            if Version(triton.__version__) >= Version("3.0.0"):
                try: from triton.backends.nvidia.driver import libcuda_dirs
                except: pass
            else: from triton.common.build import libcuda_dirs
            cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
            libcuda_dirs()
        except:
            warnings.warn(
                "Unsloth: CUDA is not linked properly.\n"\
                "Try running `python -m bitsandbytes` then `python -m xformers.info`\n"\
                "We tried running `ldconfig /usr/lib64-nvidia` ourselves, but it didn't work.\n"\
                "You need to run in your terminal `sudo ldconfig /usr/lib64-nvidia` yourself, then import Unsloth.\n"\
                "Also try `sudo ldconfig /usr/local/cuda-xx.x` - find the latest cuda version.\n"\
                "Unsloth will still run for now, but maybe it might crash - let's hope it works!"
            )
    pass
pass
from .models import *
from .save import *
from .chat_templates import *
from .tokenizer_utils import *
from .trainer import *
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Print the correct `pip install` command for this machine by matching the
# installed torch / CUDA versions against unsloth's optional-dependency
# extras (see pyproject.toml's cuXXX[-ampere]-torchYYY groups).
try: import torch
except: raise ImportError('Install torch via `pip install torch`')
from packaging.version import Version as V
v = V(torch.__version__)
cuda = str(torch.version.cuda)
# Ampere (compute capability 8.x+) machines get the "-ampere" extras,
# which additionally install flash-attn.
is_ampere = torch.cuda.get_device_capability()[0] >= 8
# Only CUDA versions with pinned xformers wheels are supported.
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4": raise RuntimeError(f"CUDA = {cuda} not supported!")
# NOTE(review): torch builds usually carry a local tag ("2.1.0+cu118"), which
# packaging orders *after* the plain release — so 2.1.0+cuXXX passes the first
# check and maps to torch211 here; a bare "2.1.0" would be rejected. Confirm
# this asymmetry is intended.
if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
elif v < V('2.3.0'): x = 'cu{}{}-torch220'
elif v < V('2.4.0'): x = 'cu{}{}-torch230'
elif v < V('2.5.0'): x = 'cu{}{}-torch240'
elif v < V('2.6.0'): x = 'cu{}{}-torch250'
else: raise RuntimeError(f"Torch = {v} too new!")
# e.g. "12.1" + ampere -> "cu121-ampere-torch240"
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
\ No newline at end of file
This diff is collapsed.
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .cross_entropy_loss import (
fast_cross_entropy_loss,
patch_llama_for_causal_lm,
unpatch_llama_for_causal_lm,
)
from .rms_layernorm import (
fast_rms_layernorm,
patch_rms_layernorm,
unpatch_rms_layernorm,
)
from .layernorm import (
fast_layernorm,
patch_layernorm,
unpatch_layernorm,
)
from .rope_embedding import fast_rope_embedding, inplace_rope_embedding
from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
from .geglu import (
geglu_exact_forward_kernel,
geglu_exact_backward_kernel,
geglu_approx_forward_kernel,
geglu_approx_backward_kernel,
)
from .fast_lora import (
get_lora_parameters,
get_lora_parameters_bias,
apply_lora_mlp_swiglu,
apply_lora_mlp_geglu_exact,
apply_lora_mlp_geglu_approx,
apply_lora_qkv,
apply_lora_o,
)
from .utils import fast_dequantize, fast_gemv, QUANT_STATE, fast_linear_forward, matmul_lora
from .flex_attention import (
HAS_FLEX_ATTENTION,
slow_attention_softcapping,
slow_inference_attention_softcapping,
create_flex_attention_causal_mask,
create_flex_attention_sliding_window_mask,
)
# Startup banner. The emoji print can raise UnicodeEncodeError on consoles
# whose encoding cannot represent it, so fall back to plain ASCII.
try:
    print("🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.")
except:
    print("Unsloth: Will patch your computer to enable 2x faster free finetuning.")
    pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings, MAX_FUSED_SIZE, triton_tanh
from transformers.models.llama.modeling_llama import logger
@triton.heuristics({
    "DO_SOFTCAPPING":   lambda args: args["DO_SOFTCAPPING"  ],
    "DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _cross_entropy_forward(
    logits_ptr, logits_row_stride,
    loss_ptr,
    logsumexp_ptr,
    labels_ptr,
    VOCAB_SIZE : tl.constexpr,
    BLOCK_SIZE : tl.constexpr,
    DO_SOFTCAPPING : tl.constexpr,
    SOFTCAP : tl.constexpr,
    DO_LOGIT_SCALING: tl.constexpr,
    LOGIT_SCALE : tl.constexpr,
):
    """
    Per-row cross entropy: one Triton program handles one full row of logits
    (requires BLOCK_SIZE >= VOCAB_SIZE). Writes the row's loss and its
    logsumexp (the logsumexp is saved so the backward pass can recompute
    the softmax without re-reducing).

    Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
    Pi = exp(xi) / sum(exp(xi))
    CE_i = -y log(p) = -y log[ exp(x) / sum(exp(x)) ]
         = -y [ x - log[sum(exp(x))] ]
         =  y * (log[sum(exp(x))] - x)
    If y == 0: CE_i = 0
    If y == 1: CE_i = logsumexp - x

    logsumexp is also stable:
    Take y = log[sum(exp(x))]
       exp(y) = sum(exp(x))
       exp(y) = sum(exp(x - c)*exp(c))   Since e^(x-c)*e^c = e^x
       exp(y) = exp(c)*sum(exp(x - c))
       y = log(exp(c)*sum(exp(x - c)))
       y = c + log[sum(exp(x - c))]
    This means we can set c = max(x) to make sure
    exp(x - c) always is exp(x - max(x)).
    This ensures exp(x - max(x))'s maximum is 1 as exp(0) = 1.
    """
    row_idx = tl.program_id(0)
    # Advance pointers to this row; int64 stride avoids 32-bit overflow on
    # very large (rows * vocab) logit buffers.
    logits_ptr    += row_idx * logits_row_stride.to(tl.int64)
    loss_ptr      += row_idx
    logsumexp_ptr += row_idx
    labels_ptr    += row_idx

    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < VOCAB_SIZE

    label_idx = tl.load(labels_ptr).to(tl.int32)
    # Out-of-vocab lanes load -inf so they vanish in the max / sum reductions.
    logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))

    # Go logit scaling for Cohere: t * x
    if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
    # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
    if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)

    logits = logits.to(tl.float32)
    c = tl.max(logits, 0)
    logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))

    if label_idx != -100:
        # The target logit is reloaded raw from memory, so scaling and
        # softcapping must be re-applied to it.
        x = tl.load(logits_ptr + label_idx)
        # Go logit scaling for Cohere: t * x
        if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
        # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
        if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
        loss = logsumexp - x.to(tl.float32)
    else:
        # label == -100 marks an ignored (padding) position.
        loss = 0.0
    tl.store(logsumexp_ptr, logsumexp)
    tl.store(loss_ptr, loss)
pass
@triton.heuristics({
    "DO_SOFTCAPPING":   lambda args: args["DO_SOFTCAPPING"  ],
    "DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _chunked_cross_entropy_forward(
    logits_ptr, logits_row_stride,
    loss_ptr,
    logsumexp_ptr,
    labels_ptr,
    VOCAB_SIZE : tl.constexpr,
    N_CHUNKS : tl.constexpr,
    BLOCK_SIZE : tl.constexpr,
    DO_SOFTCAPPING : tl.constexpr,
    SOFTCAP : tl.constexpr,
    DO_LOGIT_SCALING: tl.constexpr,
    LOGIT_SCALE : tl.constexpr,
):
    """
    Chunked variant of _cross_entropy_forward for vocabs too large for one
    block: program (row, chunk) computes the logsumexp of its own chunk only,
    and stores a *partial* loss of just -x (chunk 0 only). The caller then
    reduces with torch.logsumexp over the chunk axis and adds it to the
    partial loss.

    256K vocab divided in 4 chunks
    |-65536-| |-65536-| |-65536-| |-65536-|
    |-------| |-------| |-------| |-------|
    |-------| |-------| |-------| |-------|

    If y == 0: CE_i = 0
    If y == 1: CE_i = logsumexp - x

    Notice we can do logsumexp for each chunk and then
    logsumexp[chunk_sum(logsumexp)] == logsumexp

    chunk_sum = log[chunk_sum(logsumexp)]
              = log[exp(logsumexp(a)) + ... + exp(logsumexp(z))]
              = log[exp(log[sum(exp(a))]) + ... + exp(log[sum(exp(z))])]
              = log[sum(exp(a)) + ... + sum(exp(z))]
              = logsumexp(x)

    This means we can perform a logsumexp for each chunk, then do a
    final logsumexp reduction!  Ie do: logsumexp(chunked_logsumexp) - x
    """
    row_idx   = tl.program_id(0)
    chunk_idx = tl.program_id(1)
    logits_ptr += row_idx * logits_row_stride.to(tl.int64)
    loss_ptr   += row_idx
    # One logsumexp slot per (row, chunk) pair.
    logsumexp_ptr += row_idx * N_CHUNKS + chunk_idx
    labels_ptr += row_idx

    col_offsets = chunk_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < VOCAB_SIZE

    label_idx = tl.load(labels_ptr).to(tl.int32)
    logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
    # Go logit scaling for Cohere: t * x
    if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
    # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
    if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)

    logits = logits.to(tl.float32)
    c = tl.max(logits, 0)
    logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))

    if chunk_idx == 0:
        # logsumexp(chunked_logsumexp) - x
        # Do the -x separately: only chunk 0 owns the loss slot.
        if label_idx != -100:
            x = tl.load(logits_ptr + label_idx).to(tl.float32)
            # Go logit scaling for Cohere: t * x
            if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
            # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
            if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
            loss = -1.0 * x.to(tl.float32)
        else:
            loss = 0.0
        tl.store(loss_ptr, loss)
    pass
    tl.store(logsumexp_ptr, logsumexp)
pass
@triton.heuristics({
    "DO_SOFTCAPPING":   lambda args: args["DO_SOFTCAPPING"  ],
    "DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _cross_entropy_backward(
    logits_ptr, logits_row_stride,
    dloss_ptr, dloss_row_stride,
    logsumexp_ptr,
    labels_ptr,
    VOCAB_SIZE : tl.constexpr,
    BLOCK_SIZE : tl.constexpr,
    DO_SOFTCAPPING : tl.constexpr,
    SOFTCAP : tl.constexpr,
    DO_LOGIT_SCALING: tl.constexpr,
    LOGIT_SCALE : tl.constexpr,
):
    """
    Writes dC/dlogits *in place* over the logits buffer, one (row, block)
    program per BLOCK_SIZE slice of the vocab.

    CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
    dC/dx = d/dx (y * log[sum(exp(x))] - x * y)
    From https://en.wikipedia.org/wiki/LogSumExp
    d/dx logsumexp = exp(x) / sum(exp(x)) = softmax(x)

    dC/dx = y * exp(x) / sum(exp(x)) - d/dx (x * y)
    dC/dx = y * exp[ log[exp(x) / sum(exp(x))] ] using x = exp(log(x)) trick
    dC/dx = y * exp[x - logsumexp] - d/dx (x * y)

    If y == 0: dC/dx = 0
    If y == 1 and x == label: dC/dlabel = exp[x - logsumexp] - 1
    If y == 1 and x != label: dC/dx = exp[x - logsumexp]
    """
    row_idx   = tl.program_id(0)
    block_idx = tl.program_id(1)

    logits_ptr += row_idx * logits_row_stride.to(tl.int64)
    dloss_ptr  += row_idx * dloss_row_stride
    col_offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < VOCAB_SIZE
    label_idx = tl.load(labels_ptr + row_idx).to(tl.int32)

    # Ignored rows (-100) propagate a zero upstream gradient.
    if label_idx != -100:
        dloss = tl.load(dloss_ptr)
    else:
        dloss = 0.0

    x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))

    # Do logit scaling for Cohere
    if DO_LOGIT_SCALING:
        # d/dx [s * x] = s
        x = x * LOGIT_SCALE
    pass

    # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
    if DO_SOFTCAPPING:
        # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
        # `partial` is kept so the tanh is computed once and reused below;
        # the later read of `partial` is safe because DO_SOFTCAPPING is a
        # constexpr, so both branches are compiled consistently.
        partial = triton_tanh(x / SOFTCAP)
        x = SOFTCAP * partial
    pass

    logsumexp = tl.load(logsumexp_ptr + row_idx)
    y = tl.exp(x.to(tl.float32) - logsumexp)  # softmax(x)
    y = tl.where(
        col_offsets == label_idx,
        y - 1.0, # exp(x - logsumexp) - 1
        y,       # exp(x - logsumexp)
    )

    # Chain rule through the (optional) scaling / softcapping transforms.
    if DO_LOGIT_SCALING:
        # d/dx [s * x] = s
        y = y * LOGIT_SCALE
    pass

    if DO_SOFTCAPPING:
        # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
        y = y * (1.0 - partial*partial)
    pass

    # If y == 0: dC/dx = 0 ==> we already masked it to be = 0, so dloss = 0.
    tl.store(logits_ptr + col_offsets, dloss * y, mask = mask)
pass
# Per-launch chunk width for the fused CE kernels.
# NOTE(review): this shadows the MAX_FUSED_SIZE imported from .utils above
# with a smaller 16384 value, forcing vocabs > 16384 through the chunked
# kernel — presumably intentional; confirm before changing.
MAX_FUSED_SIZE = 16384 #65536 # 2**16
class Fast_CrossEntropyLoss(torch.autograd.Function):
    """
    Fused Triton cross entropy with optional logit softcapping (Gemma 2)
    and logit scaling (Cohere). forward returns per-row losses (0 where
    label == -100); backward overwrites the saved `logits` buffer in place
    with dC/dlogits to save memory.
    """
    @staticmethod
    def forward(ctx, logits, labels, logit_softcapping = 0, logit_scaling = 0):
        # logits: (n_rows, vocab_size); labels: (n_rows,) with -100 = ignore.
        n_rows, vocab_size = logits.shape

        div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
        n_chunks = div + (mod != 0)
        #losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
        losses = torch.empty(n_rows, dtype = torch.float32, device = logits.device)

        # A value of 0 disables the corresponding transform.
        DO_SOFTCAPPING   = (logit_softcapping != 0)
        DO_LOGIT_SCALING = (logit_scaling != 0)

        if n_chunks == 1:
            # For small vocabs <= 65336 like Llama, Mistral
            BLOCK_SIZE, num_warps = calculate_settings(vocab_size)
            #logsumexp = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
            logsumexp = torch.empty(n_rows, dtype = torch.float32, device = logits.device)

            _cross_entropy_forward[(n_rows,)](
                logits, logits.stride(0),
                losses,
                logsumexp,
                labels,
                VOCAB_SIZE       = vocab_size,
                BLOCK_SIZE       = BLOCK_SIZE,
                DO_SOFTCAPPING   = DO_SOFTCAPPING,
                SOFTCAP          = logit_softcapping,
                DO_LOGIT_SCALING = DO_LOGIT_SCALING,
                LOGIT_SCALE      = logit_scaling,
                num_warps        = num_warps,
            )
        else:
            # For large vocabs > 65336 like Gemma 256K
            #logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = "cuda:0")
            logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = logits.device)

            _chunked_cross_entropy_forward[(n_rows, n_chunks,)](
                logits, logits.stride(0),
                losses,
                logsumexp,
                labels,
                VOCAB_SIZE       = vocab_size,
                N_CHUNKS         = n_chunks,
                BLOCK_SIZE       = MAX_FUSED_SIZE,
                DO_SOFTCAPPING   = DO_SOFTCAPPING,
                SOFTCAP          = logit_softcapping,
                DO_LOGIT_SCALING = DO_LOGIT_SCALING,
                LOGIT_SCALE      = logit_scaling,
                num_warps        = 8,
            )
            # logsumexp(chunked_logsumexp) - x
            # Do the -x separately (the kernel stored -x in `losses`).
            logsumexp = torch.logsumexp(logsumexp, dim = 1) # Row sum
            losses += logsumexp
            losses.masked_fill_(labels == -100, 0) # Don't forget to mask padding out!
        pass

        ctx.save_for_backward(logits, logsumexp, labels)
        ctx.DO_SOFTCAPPING    = DO_SOFTCAPPING
        ctx.logit_softcapping = logit_softcapping
        ctx.DO_LOGIT_SCALING  = DO_LOGIT_SCALING
        ctx.logit_scaling     = logit_scaling
        return losses
    pass

    @staticmethod
    def backward(ctx, dlosses):
        logits, logsumexp, labels = ctx.saved_tensors
        n_rows, vocab_size = logits.shape

        BLOCK_SIZE = 4096
        div, mod = divmod(vocab_size, BLOCK_SIZE)
        n_blocks = div + (mod != 0)

        _cross_entropy_backward[(n_rows, n_blocks,)](
            logits, logits.stride(0),
            dlosses, dlosses.stride(0),
            logsumexp,
            labels,
            VOCAB_SIZE       = vocab_size,
            BLOCK_SIZE       = BLOCK_SIZE,
            DO_SOFTCAPPING   = ctx.DO_SOFTCAPPING,
            SOFTCAP          = ctx.logit_softcapping,
            DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
            LOGIT_SCALE      = ctx.logit_scaling,
            num_warps        = 8,
        )
        # `logits` was overwritten in place by the kernel with dC/dlogits.
        return logits, None, None, None,
    pass
pass
@torch._disable_dynamo
def fast_cross_entropy_loss(
    logits,
    labels,
    logit_softcapping = 0,
    logit_scaling = 0,
    n_items = None,
):
    """
    Mean fused cross entropy over a batch of (already aligned) logits/labels.

    Arguments:
        logits: (batch, seq_len, vocab_size)
        labels: (batch, seq_len,) with -100 marking ignored positions
        logit_softcapping: Gemma-2 style tanh softcap t (0 disables it)
        logit_scaling: Cohere style logit scale s (0 disables it)
        n_items: optional token count to normalize by; when None it is
            computed as the number of labels != -100
    Returns:
        losses: float (scalar tensor) — summed loss divided by n_items
    """
    batch, seq_len, d = logits.shape
    assert(labels.shape == (batch, seq_len))

    loss = Fast_CrossEntropyLoss.apply(
        logits.view(batch*seq_len, d),
        labels.view(-1),
        logit_softcapping,
        logit_scaling,
    )
    if n_items is None:
        n_items = torch.count_nonzero(labels != -100)
    # Guard against every label being masked out: the per-token losses are
    # already zeroed for -100 labels, so the sum is 0 — return it directly
    # instead of producing 0/0 = NaN.
    if n_items == 0: return loss.sum()
    return loss.sum() / n_items
pass
from transformers.models.llama.modeling_llama import (
LlamaForCausalLM,
CausalLMOutputWithPast,
Optional,
Union,
Cache,
List,
Tuple,
)
# Transformers 4.47 need Unpack, KwargsForCausalLM
try:
from transformers.models.llama.modeling_llama import Unpack, KwargsForCausalLM
except:
pass
pass
import inspect, re
function = inspect.getsource(LlamaForCausalLM.forward)
function = function.split("\n")
i = re.match(r"[ ]{1,}", function[0]).span(0)[1]
function = [x[i:] for x in function]
function = "\n".join(function)
function = function[function.find("def forward"):]
replacement = """ loss = None
logit_softcapping = getattr(self.config, "final_logit_softcapping", 0)
logit_scaling = getattr(self.config, "logit_scale", 0)
if labels is not None:
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
# Fixes https://github.com/unslothai/unsloth/issues/10
self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda:0")
pass
shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]]))
loss = fast_cross_entropy_loss(
logits = shift_logits,
labels = shift_labels,
logit_softcapping = logit_softcapping,
logit_scaling = logit_scaling,
n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None),
)
else:
if logit_scaling != 0:
if logits.requires_grad:
logits = logit_scaling * logits
else:
logits *= logit_scaling
pass
pass
if logit_softcapping != 0:
if logits.requires_grad:
logits = (1.0 / logit_softcapping) * logits
logits = torch.tanh(logits)
logits = logit_softcapping * logits
else:
logits *= (1.0 / logit_softcapping)
torch.tanh(logits, out = logits)
logits *= logit_softcapping
pass
pass
pass
"""
function = \
function[:function.find(" loss = None")] + \
replacement + \
function[ function.find(" if not return_dict"):]
function = function.replace("logits = logits.float()", "\n")
# Missed spaces
function = function.split("\n")
# Not the first one though!
function = [function[0]] + [" "*4 + x for x in function[1:]]
function = "\n".join(function)
function = f"class Unsloth_LlamaForCausalLM(LlamaForCausalLM):\n"\
f" {function}\n"
exec(function, globals())
del function, replacement, inspect, re
def patch_llama_for_causal_lm():
    """Swap transformers' LlamaForCausalLM for the Unsloth fused-loss subclass."""
    import transformers.models.llama.modeling_llama as llama_module
    llama_module.LlamaForCausalLM = Unsloth_LlamaForCausalLM
pass
def unpatch_llama_for_causal_lm():
    """Restore the original transformers LlamaForCausalLM class."""
    import transformers.models.llama.modeling_llama as llama_module
    llama_module.LlamaForCausalLM = LlamaForCausalLM
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .utils import (
fast_dequantize,
QUANT_STATE,
get_lora_parameters,
get_lora_parameters_bias,
matmul_lora,
torch_amp_custom_fwd,
torch_amp_custom_bwd,
)
class LoRA_MLP(torch.autograd.Function):
    """
    Fused gate/up/down LoRA MLP with a pluggable activation pair
    (_forward_function / _backward_function, e.g. SwiGLU or GEGLU kernels).

    ### LoRA weights
    G = G + Ag @ Bg
    U = U + Au @ Bu
    W = W + Aw @ Bw

    ### SwiGLU(X)
    e = X @ G
    f = e * sigmoid(e)
    g = X @ U
    h = f * g
    i = h @ W

    ### Backpropagation chain rule
    See our blog post for more details
    df = sigmoid(e) * (1 - f) + f
    dC/dW = h.T @ dY
    dC/dU = X.T @ (D @ W.T * f)
    dC/dG = X.T @ (D @ W.T * df * g)

    ### Down projection LoRA weights
    dC/dAw = dC/dW @ B.T
    dC/dBw = A.T @ dC/dW
    dC/dAw = h.T @ dY @ B.T
    dC/dBw = A.T @ h.T @ dY

    ### Up projection LoRA weights
    dC/dAu = X.T @ (D @ W.T * f) @ B.T
    dC/dBu = A.T @ X.T @ (D @ W.T * f)

    ### Gate projection LoRA weights
    dC/dAg = X.T @ (D @ W.T * df * g) @ B.T
    dC/dBg = A.T @ X.T @ (D @ W.T * df * g)

    Don't forget to see our blog post for more details!
    """
    @staticmethod
    @torch_amp_custom_fwd
    def forward(ctx, X : torch.Tensor,
                gateW, gateW_quant, gateA, gateB, gateS,
                upW, upW_quant, upA, upB, upS,
                downW, downW_quant, downA, downB, downS,
                _forward_function, _backward_function,
                inplace = True,):
        # X: (batch, seq_len, hd). Each projection is a (possibly quantized)
        # base weight plus a scaled LoRA A @ B pair.
        dtype = X.dtype

        e = matmul_lora(X, gateW, gateW_quant, gateA, gateB, gateS)
        g = matmul_lora(X, upW, upW_quant, upA, upB, upS)
        # Fused activation kernel, e.g. h = swiglu(e) * g.
        h = _forward_function(e, g)
        i = matmul_lora(h, downW, downW_quant, downA, downB, downS)

        # Non-tensor / quantized objects go on ctx directly; plain tensors
        # via save_for_backward.
        ctx.custom_saved_tensors = (
            gateW, gateW_quant, gateS,
            upW, upW_quant, upS,
            downW, downW_quant, downS,
            _backward_function,
        )
        ctx.save_for_backward(gateA, gateB, upA, upB, downA, downB,
                              X, e, g)
        # When inplace, backward reuses X's storage for dX.
        ctx.inplace = inplace
        return i
    pass

    @staticmethod
    @torch_amp_custom_bwd
    def backward(ctx, dY : torch.Tensor):
        gateW, gateW_quant, gateS, upW, upW_quant, upS, downW, downW_quant, downS, \
            _backward_function = ctx.custom_saved_tensors
        gateA, gateB, upA, upB, downA, downB, \
            X, e, g = ctx.saved_tensors

        gateA, gateB, upA, upB, downA, downB = \
            gateA.t(), gateB.t(), upA.t(), upB.t(), downA.t(), downB.t()

        batch, seq_len, hd = X.shape
        # Flatten (batch, seq_len) into one dim for the matmuls below.
        dY = dY.view(-1, dY.shape[-1])
        X  = X .view(-1, X .shape[-1])
        e  = e .view(-1, e .shape[-1])
        g  = g .view(-1, g .shape[-1])
        dtype = X.dtype

        # DW = dY projected back through the down projection.
        DW = matmul_lora(dY, downW.t(), downW_quant, downB, downA, downS)
        # The activation backward kernel overwrites its buffers in place:
        # returns (h, df, de).
        DW, e, g = _backward_function(DW, e, g)
        h, df, de = DW, e, g

        # Down projection LoRA weights
        d_downA = h.t() @ (dY @ downB.t())
        d_downB = (downA.t() @ h.t()) @ dY
        d_downA *= downS
        d_downB *= downS

        # Up projection LoRA weights
        d_upA = X.t() @ (df @ upB.t())
        d_upB = (upA.t() @ X.t()) @ df
        d_upA *= upS
        d_upB *= upS

        # Gate projection LoRA weights
        d_gateA = X.t() @ (de @ gateB.t())
        d_gateB = (gateA.t() @ X.t()) @ de
        d_gateA *= gateS
        d_gateB *= gateS

        # dX  = matmul_lora(df, upW.t(), upW_quant, upB, upA, upS)
        # dX += matmul_lora(de, gateW.t(), gateW_quant, gateB, gateA, gateS)
        # Dequantize base weights one at a time to bound peak memory; when
        # ctx.inplace, dX reuses X's storage.
        upW = fast_dequantize(upW.t(), upW_quant)
        dX = torch.matmul(df, upW.t(), out = X if ctx.inplace else None)
        del upW
        dX += df @ upB.to(dtype).t() @ (upS * upA.to(dtype).t())

        gateW = fast_dequantize(gateW.t(), gateW_quant)
        dX += de @ gateW.t()
        del gateW
        dX += de @ gateB.to(dtype).t() @ (gateS * gateA.to(dtype).t())

        # Gradient slots mirror forward's argument order:
        # gateW, gateW_quant, gateA, gateB, gateS,
        #   upW,   upW_quant,   upA,   upB,   upS,
        # downW, downW_quant, downA, downB, downS,
        return dX.view(batch, seq_len, hd), \
            None, None, d_gateA.t(), d_gateB.t(), None, \
            None, None, d_upA.t(),   d_upB.t(),   None, \
            None, None, d_downA.t(), d_downB.t(), None, \
            None, None, None, # _backward and _forward and inplace
    pass
pass
from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
def apply_lora_mlp_swiglu(self, X, inplace = True):
    """SwiGLU MLP forward through the fused LoRA_MLP autograd function."""
    gate_params = get_lora_parameters(self.gate_proj)
    up_params   = get_lora_parameters(self.up_proj)
    down_params = get_lora_parameters(self.down_proj)
    return LoRA_MLP.apply(
        X,
        *gate_params,
        *up_params,
        *down_params,
        swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel,
        inplace,
    )
pass
from .geglu import geglu_exact_forward_kernel, geglu_exact_backward_kernel
def apply_lora_mlp_geglu_exact(self, X, inplace = True):
    """Exact-GEGLU MLP forward through the fused LoRA_MLP autograd function."""
    gate_params = get_lora_parameters(self.gate_proj)
    up_params   = get_lora_parameters(self.up_proj)
    down_params = get_lora_parameters(self.down_proj)
    return LoRA_MLP.apply(
        X,
        *gate_params,
        *up_params,
        *down_params,
        geglu_exact_forward_kernel, geglu_exact_backward_kernel,
        inplace,
    )
pass
from .geglu import geglu_approx_forward_kernel, geglu_approx_backward_kernel
def apply_lora_mlp_geglu_approx(self, X, inplace = True):
    """
    Approximate-GEGLU MLP forward through the fused LoRA_MLP autograd function.

    `inplace` (default True) is new but backward-compatible: the swiglu and
    geglu_exact variants already accept and forward this flag, and LoRA_MLP's
    own default is True, so omitting it preserves the previous behavior.
    """
    gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
    upW,   upW_quant,   upA,   upB,   upS   = get_lora_parameters(self.up_proj)
    downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
    out = LoRA_MLP.apply(X,
                         gateW, gateW_quant, gateA, gateB, gateS,
                         upW,   upW_quant,   upA,   upB,   upS,
                         downW, downW_quant, downA, downB, downS,
                         geglu_approx_forward_kernel, geglu_approx_backward_kernel,
                         inplace,)
    return out
pass
class LoRA_QKV(torch.autograd.Function):
    """
    Fused Q/K/V projections for three LoRA-adapted linear layers.

    ### LoRA weights
    Wq = Wq + Aq @ Bq
    Wk = Wk + Ak @ Bk
    Wv = Wv + Av @ Bv
    Q = X @ Wq = X @ Wq + X @ Aq @ Bq
    K = X @ Wk = X @ Wk + X @ Ak @ Bk
    V = X @ Wv = X @ Wv + X @ Av @ Bv

    ### Backpropagation chain rule
    See our blogpost for more details.
    dC/dWq = X.T @ D(Wq)
    dC/dWk = X.T @ D(Wk)
    dC/dWv = X.T @ D(Wv)
    We then sum them all find dC/dX

    ### Q projection LoRA weights
    dC/dAq = X.T @ D(Wq) @ B.T
    dC/dBq = A.T @ X.T @ D(Wq)

    ### K projection LoRA weights
    dC/dAk = X.T @ D(Wk) @ B.T
    dC/dBk = A.T @ X.T @ D(Wk)

    ### V projection LoRA weights
    dC/dAv = X.T @ D(Wv) @ B.T
    dC/dBv = A.T @ X.T @ D(Wv)
    """
    @staticmethod
    @torch_amp_custom_fwd
    def forward(ctx, X : torch.Tensor,
                QW, QW_quant, QA, QB, QS,
                KW, KW_quant, KA, KB, KS,
                VW, VW_quant, VA, VB, VS,
                inplace = True):
        # X: (batch, seq_len, hd); each projection is base weight + scaled LoRA.
        dtype = X.dtype

        Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)
        K = matmul_lora(X, KW, KW_quant, KA, KB, KS)
        V = matmul_lora(X, VW, VW_quant, VA, VB, VS)

        ctx.custom_saved_tensors = (
            QW, QW_quant, QS,
            KW, KW_quant, KS,
            VW, VW_quant, VS,
        )
        ctx.save_for_backward(X, QA, QB, KA, KB, VA, VB,)
        # When inplace, backward reuses X's storage for dX.
        ctx.inplace = inplace
        return Q, K, V
    pass

    @staticmethod
    @torch_amp_custom_bwd
    def backward(ctx, dQ, dK, dV):
        QW, QW_quant, QS, KW, KW_quant, KS, VW, VW_quant, VS = \
            ctx.custom_saved_tensors
        X, QA, QB, KA, KB, VA, VB, = ctx.saved_tensors

        QA, QB, KA, KB, VA, VB = \
            QA.t(), QB.t(), KA.t(), KB.t(), VA.t(), VB.t()

        batch, seq_len, hd = X.shape
        dQ = dQ.view(-1, dQ.shape[-1])
        dK = dK.reshape(-1, dK.shape[-1]) # view doesn't work on K.T
        dV = dV.view(-1, dV.shape[-1])
        X  = X .view(-1, X .shape[-1])
        dtype = X.dtype

        ### Weight projection LoRA weights
        # See our blogpost for more details.

        # Q Projection
        d_QA = X.t() @ (dQ @ QB.t())
        d_QB = (QA.t() @ X.t()) @ dQ
        d_QA *= QS
        d_QB *= QS

        # K Projection
        d_KA = X.t() @ (dK @ KB.t())
        d_KB = (KA.t() @ X.t()) @ dK
        d_KA *= KS
        d_KB *= KS

        # V Projection
        d_VA = X.t() @ (dV @ VB.t())
        d_VB = (VA.t() @ X.t()) @ dV
        d_VA *= VS
        d_VB *= VS

        # Combine derivatives to find dX; dequantize one base weight at a
        # time to bound peak memory.
        # dQ
        QW = fast_dequantize(QW.t(), QW_quant)
        dX = torch.matmul(dQ, QW.t(), out = X if ctx.inplace else None)
        del QW
        dX += (dQ @ QB.to(dtype).t() @ (QS * QA.to(dtype).t()))

        # dK
        KW = fast_dequantize(KW.t(), KW_quant)
        dX += dK @ KW.t()
        del KW
        dX += dK @ KB.to(dtype).t() @ (KS * KA.to(dtype).t())

        # dV
        VW = fast_dequantize(VW.t(), VW_quant)
        dX += dV @ VW.t()
        del VW
        dX += dV @ VB.to(dtype).t() @ (VS * VA.to(dtype).t())

        # Gradient slots mirror forward's argument order:
        # QW, QW_quant, QA, QB, QS,
        # KW, KW_quant, KA, KB, KS,
        # VW, VW_quant, VA, VB, VS,
        return dX.view(batch, seq_len, hd), \
            None, None, d_QA.t(), d_QB.t(), None, \
            None, None, d_KA.t(), d_KB.t(), None, \
            None, None, d_VA.t(), d_VB.t(), None, \
            None,
    pass
pass
def apply_lora_qkv(self, X, inplace = True):
    """Fused Q/K/V projection through the LoRA_QKV autograd function."""
    q_params = get_lora_parameters(self.q_proj)
    k_params = get_lora_parameters(self.k_proj)
    v_params = get_lora_parameters(self.v_proj)
    return LoRA_QKV.apply(
        X,
        *q_params,
        *k_params,
        *v_params,
        inplace,
    )
pass
class LoRA_W(torch.autograd.Function):
    """
    Fused matmul for a single LoRA-adapted linear layer:

    ### LoRA weights
    W  = W + A @ B
    XW = X @ W = X @ W + X @ A @ B

    ### Backpropagation chain rule
    dC/dW = X.T @ dY
    dC/dA = X.T @ dY @ B.T
    dC/dB = A.T @ X.T @ dY
    """
    @staticmethod
    @torch_amp_custom_fwd
    def forward(ctx, X : torch.Tensor,
                W, W_quant, A, B, S):
        # X: (batch, seq_len, hd); W possibly quantized, A @ B scaled by S.
        dtype = X.dtype
        XW = matmul_lora(X, W, W_quant, A, B, S)
        ctx.custom_saved_tensors = (W, W_quant, S,)
        ctx.save_for_backward(A, B, X)
        return XW
    pass

    @staticmethod
    @torch_amp_custom_bwd
    def backward(ctx, dY : torch.Tensor):
        W, W_quant, S = ctx.custom_saved_tensors
        A, B, X = ctx.saved_tensors

        A, B = A.t(), B.t()

        batch, seq_len, hd = X.shape
        dY = dY.reshape(-1, dY.shape[-1]) # Must be reshape
        X  = X .reshape(-1, X .shape[-1]) # Must be reshape
        dtype = X.dtype

        ### Weight projection LoRA weights
        # Weight projection
        d_A = X.t() @ (dY @ B.t())
        d_B = (A.t() @ X.t()) @ dY
        d_A *= S
        d_B *= S

        # Get derivative for dX
        W = fast_dequantize(W.t(), W_quant)
        dX = dY @ W.t()
        del W
        dX += dY @ B.to(dtype).t() @ (S * A.to(dtype).t())

        # Gradient slots mirror forward's argument order: W, W_quant, A, B, S
        return dX.view(batch, seq_len, hd), \
            None, None, d_A.t(), d_B.t(), None
    pass
pass
def apply_lora_o(self, X):
    """Output (o_proj) projection through the LoRA_W autograd function."""
    o_params = get_lora_parameters(self.o_proj)
    return LoRA_W.apply(X, *o_params)
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from functools import lru_cache
from transformers.models.llama.modeling_llama import logger
# Options forwarded to torch.compile for the flex-attention path below.
torch_compile_options = {
    "epilogue_fusion"   : True,
    "max_autotune"      : True,
    "shape_padding"     : True,
    "trace.enabled"     : False, # Output Triton kernel outputs!
    "triton.cudagraphs" : False,
}
# Flex Attention supported from torch 2.5 onwards only
try:
    from torch.nn.attention.flex_attention import (
        flex_attention as _flex_attention,
        create_block_mask as _create_block_mask,
    )
    _flex_attention = torch.compile(_flex_attention, dynamic = True, options = torch_compile_options)
    # NOTE(review): this is False even when the import and compile succeed,
    # so the flex-attention branch below is never taken — presumably a
    # deliberate kill-switch; confirm before "fixing" it to True.
    HAS_FLEX_ATTENTION = False
except:
    HAS_FLEX_ATTENTION = False
pass
if not HAS_FLEX_ATTENTION:

    # Logit softcapping
    # Fallback path: materializes the full (q_len, q_len) score matrix under
    # torch.compile, with Gemma-2 style tanh softcapping.
    @torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
    def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
        n_heads    = self.num_heads
        head_dim   = self.head_dim
        n_kv_heads = self.num_key_value_heads
        n_groups   = self.num_key_value_groups

        # Grouped query attention: repeat each KV head across its query group.
        K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
        V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
        K = K.reshape(bsz, n_heads, q_len, head_dim)
        V = V.reshape(bsz, n_heads, q_len, head_dim)

        # See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
        # Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
        # We default to using the config file itself
        # s = self.config.hidden_size // self.config.num_attention_heads
        s = self.config.query_pre_attn_scalar
        t = self.config.attn_logit_softcapping

        Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
        A = torch.matmul(Q, K.transpose(2, 3))
        A = t * torch.tanh(A / t) # Logit softcapping
        A += causal_mask[:q_len, :q_len]
        # Much slower in torch compile!
        # A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
        A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
        A = torch.matmul(A, V)
        A = A.transpose(1, 2).contiguous()
        A = A.reshape(bsz, q_len, n_heads*head_dim)
        return A
    pass

    # No flex attention => no block-mask builders are available.
    create_flex_attention_causal_mask = None
    create_flex_attention_sliding_window_mask = None
else:
    # See https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
    # for more examples
    # BSD 3-Clause License Copyright (c) 2023, Driss Guessous, Horace He et al
    import functools, math

    def generate_tanh_softcap(t):
        # score_mod for flex attention: cap each raw score as t * tanh(x / t).
        def tanh_softcap(x, b, h, q_idx, kv_idx):
            return t * torch.tanh(x / t)
        return tanh_softcap
    pass
    def causal_masker(b, h, q_idx, kv_idx):
        return q_idx >= kv_idx
    pass

    @functools.lru_cache
    def sliding_window_masker(size = 4096):
        # Causal AND within-sliding-window mask factory.
        def sliding_window(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            window_mask = q_idx - kv_idx <= size
            return causal_mask & window_mask
        return sliding_window
    pass

    @functools.lru_cache
    def create_block_mask(mask, n = 128):
        return _create_block_mask(
            mask, 1, 1, n, n,
            BLOCK_SIZE = 128,
            _compile = True,
        )
    pass

    def create_flex_attention_causal_mask(max_seq_length = 8192):
        causal_mask = create_block_mask(causal_masker, max_seq_length)
        return causal_mask
    pass
    def create_flex_attention_sliding_window_mask(max_seq_length = 8192, sliding_window = 4096):
        sliding_masker = sliding_window_masker(sliding_window)
        causal_mask = create_block_mask(sliding_masker, max_seq_length)
        return causal_mask
    pass

    @functools.lru_cache
    def flex_attention(s, t):
        # Cache a partially-applied flex attention per (scale, softcap) pair.
        scale = 1.0 / math.sqrt(s)
        score_mod = generate_tanh_softcap(t)
        return functools.partial(
            _flex_attention, score_mod = score_mod, scale = scale, enable_gqa = True,
        )
    pass

    def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
        # Flex-attention variant; `causal_mask` here is a flex block mask.
        n_heads  = self.num_heads
        head_dim = self.head_dim
        s = self.config.query_pre_attn_scalar
        t = self.config.attn_logit_softcapping
        fx = flex_attention(s, t)
        A = fx(query = Q, key = K, value = V, block_mask = causal_mask)
        A = A.transpose(1, 2).contiguous()
        A = A.reshape(bsz, q_len, n_heads*head_dim)
        return A
    pass
pass
torch_matmul = torch.matmul
torch_tanh = torch.tanh
torch_nn_functional_softmax = torch.nn.functional.softmax
def slow_inference_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
n_kv_heads = self.num_key_value_heads
n_groups = self.num_key_value_groups
# Grouped query attention
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
A = torch_matmul(Q, K.transpose(2, 3))
# Logit softcapping
A /= t; torch_tanh(A, out = A); A *= t;
A += causal_mask[:q_len, :q_len]
# Much slower in torch compile!
# A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
A = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
A = torch_matmul(A, V)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings, triton_tanh
@triton.jit
def _exact_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
    """
    Exact GEGLU forward over flat buffers: h = gelu(e) * g, where
    gelu(e) = 1/2 * e * (1 + erf(e / sqrt(2))). One program per BLOCK_SIZE
    slice of the flattened tensors.
    """
    block_idx = tl.program_id(0)
    offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    # f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
    # h = f * up
    # The gate is upcast to fp32 for the erf; the up projection stays in its
    # storage dtype.
    e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
    g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)

    f_row = 0.5 * e_row * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
    f_row = f_row.to(g_row.dtype) # Exact copy from HF
    h_row = f_row * g_row

    # Store h
    tl.store(h + offsets, h_row, mask = mask)
pass
def geglu_exact_forward_kernel(gate, up):
    """
    Launch the exact-GEGLU forward Triton kernel.

    Parameters
    ----------
    gate : (batch, seq_len, hd) tensor — gate projection output.
    up   : tensor of the same shape/dtype — up projection output.

    Returns
    -------
    Tensor of shape (batch, seq_len, hd): GELU(gate) * up.
    """
    batch, seq_len, hd = gate.shape
    n_elements = gate.numel()
    # Fix: allocate on the input's device rather than a hard-coded "cuda:0",
    # so tensors placed on other GPUs (multi-GPU / device_map) work correctly.
    out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = gate.device)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    _exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
    return out
pass
@triton.jit
def _exact_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
    """
    Exact GEGLU backward pass, computed in-place over the three buffers.

    Forward was:
        f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
        h = f * up
    df/de (with help of Wolfram :)
        df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
    Reuse via
        f = 1/2 * (1 + erf(1/sqrt(2) * e)) * e

    On exit the buffers are overwritten (saves allocating new tensors):
        DW <- h  = f * g             recomputed forward activation
        e  <- df = DW * f            gradient w.r.t. the up projection g
        g  <- de = (DW * g) * df/de  gradient w.r.t. the gate projection e
    """
    block_idx = tl.program_id(0)
    offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    # Upstream gradient plus the saved forward inputs. Only e is promoted to
    # fp32 (needed for erf/exp); DW and g stay in their storage dtype, matching
    # the forward kernel's casting behaviour.
    DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
    e_row  = tl.load(e  + offsets, mask = mask, other = 0).to(tl.float32)
    g_row  = tl.load(g  + offsets, mask = mask, other = 0)#.to(tl.float32)

    # Break e_row away for re-use
    # f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
    # f_partial_row = 1/2 * (1 + erf(e/sqrt(2))) is kept so df/de below can
    # reuse it without recomputing erf.
    f_partial_row = 0.5 * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
    f_row = f_partial_row * e_row
    f_row = f_row.to(DW_row.dtype)
    # h = f * g
    h_row  =  f_row * g_row
    # df = DW * f
    df_row = DW_row * f_row
    # dg = DW * g
    dg_row = DW_row * g_row

    # df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
    t = 0.3989422804014327 # 1/sqrt(2*pi)
    df_de = f_partial_row + t * e_row * tl.exp(-0.5 * e_row * e_row)

    # Chain rule in fp32, then cast back down for storage.
    de_row = dg_row.to(tl.float32) * df_de
    de_row = de_row.to(DW_row.dtype)

    # Store derivatives in buffers (in-place overwrite — see docstring).
    tl.store(DW + offsets, h_row,  mask = mask) # h  = f * g
    tl.store(e  + offsets, df_row, mask = mask) # df = DW * f
    tl.store(g  + offsets, de_row, mask = mask) # de
pass
def geglu_exact_backward_kernel(DW, e, g):
    """
    Launch the exact-GEGLU backward Triton kernel, in-place.

    The three 2D buffers are overwritten by the kernel:
        DW <- h (recomputed forward output), e <- df, g <- de.
    Returns the same (mutated) buffers for convenience.
    """
    n_rows, n_cols = e.shape  # unpack also acts as a 2D-shape check
    total = e.numel()
    grid = lambda meta: (triton.cdiv(total, meta['BLOCK_SIZE']),)
    _exact_backward_kernel[grid](DW, e, g, total, BLOCK_SIZE = 1024,)
    return DW, e, g
pass
@triton.jit
def _approx_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
    """
    Tanh-approximate GEGLU forward pass, elementwise:
        f = 1/2 * e * (1 + tanh( sqrt(2/pi) * e * (1 + 0.044715 * e^2) ))
        h = f * g
    Writes the result into the preallocated output buffer `h`.
    """
    SQRT_2_OVER_PI = 0.7978845608028654 # math.sqrt(2 / math.pi)

    pid = tl.program_id(0)
    idx = pid*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    valid = idx < n_elements

    gate = tl.load(e + idx, mask = valid, other = 0).to(tl.float32)
    up   = tl.load(g + idx, mask = valid, other = 0)#.to(tl.float32)

    # Inner argument uses the factored form x*(1 + 0.044715*x^2) rather than
    # x + 0.044715*x^3 — same value, one fewer multiply.
    inner = SQRT_2_OVER_PI * gate * (1.0 + 0.044715 * gate * gate)
    gelu_gate = 0.5 * gate * (triton_tanh(inner) + 1.0)
    gelu_gate = gelu_gate.to(up.dtype) # cast back before multiply, HF-style

    tl.store(h + idx, gelu_gate * up, mask = valid)
pass
def geglu_approx_forward_kernel(gate, up):
    """
    Launch the tanh-approximate GEGLU forward Triton kernel.

    Parameters
    ----------
    gate : (batch, seq_len, hd) tensor — gate projection output.
    up   : tensor of the same shape/dtype — up projection output.

    Returns
    -------
    Tensor of shape (batch, seq_len, hd): GELU_tanh(gate) * up.
    """
    batch, seq_len, hd = gate.shape
    n_elements = gate.numel()
    # Fix: allocate on the input's device rather than a hard-coded "cuda:0",
    # so tensors placed on other GPUs (multi-GPU / device_map) work correctly.
    out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = gate.device)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    _approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
    return out
pass
@triton.jit
def _approx_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
    """
    Tanh-approximate GEGLU backward pass, computed in-place.

    f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
    h = f * up

    df/de (with help from https://arxiv.org/pdf/2305.12073.pdf :))
    df/de = 1/2 * [1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )] +
            1/2 * sech^2 [ sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ] * \
                ( sqrt(2/pi) * x * (1 + 0.044715 * x^2 * 3 ) )
    Notice sech^2(x) = 1 - tanh^2(x)
    So reuse tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )
    See https://www.desmos.com/calculator/nqprfoni6x

    On exit the buffers are overwritten (saves allocating new tensors):
        DW <- h  = f * g             recomputed forward activation
        e  <- df = DW * f            gradient w.r.t. the up projection g
        g  <- de = (DW * g) * df/de  gradient w.r.t. the gate projection e
    """
    block_idx = tl.program_id(0)
    offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    # Upstream gradient plus the saved forward inputs. Only e is promoted to
    # fp32; DW and g stay in their storage dtype, matching the forward kernel.
    DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
    e_row  = tl.load(e  + offsets, mask = mask, other = 0).to(tl.float32)
    g_row  = tl.load(g  + offsets, mask = mask, other = 0)#.to(tl.float32)

    # See https://www.desmos.com/calculator/nqprfoni6x
    s = 0.7978845608028654 # math.sqrt(2 / math.pi)

    a = s * e_row # a = sqrt(2 / pi) * x
    b = a * 0.044715 * e_row * e_row # b = a * 0.044715 * x^2
    # T = 1 + tanh(a + b); tanh is computed once and reused for both f and
    # the sech^2 term below via sech^2 = 1 - tanh^2 = -T*(T - 2).
    T = 1.0 + triton_tanh(a + b)
    T2 = 0.5 * T
    # Q = 0.5 * -T * (T - 2.0) * (a + 3.0 * b)
    Q2 = -T2 * (T - 2.0) * (a + 3.0 * b)
    df_de = T2 + Q2 # 1/2 * (T + Q)

    # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
    f_row = T2 * e_row
    f_row = f_row.to(DW_row.dtype)
    # h = f * g
    h_row  =  f_row * g_row
    # df = DW * f
    df_row = DW_row * f_row
    # dg = DW * g
    dg_row = DW_row * g_row

    # Chain rule in fp32, then cast back down for storage.
    de_row = dg_row.to(tl.float32) * df_de
    de_row = de_row.to(DW_row.dtype)

    # Store derivatives in buffers (in-place overwrite — see docstring).
    tl.store(DW + offsets, h_row,  mask = mask) # h  = f * g
    tl.store(e  + offsets, df_row, mask = mask) # df = DW * f
    tl.store(g  + offsets, de_row, mask = mask) # de
pass
def geglu_approx_backward_kernel(DW, e, g):
    """
    Launch the tanh-approximate GEGLU backward Triton kernel, in-place.

    The three 2D buffers are overwritten by the kernel:
        DW <- h (recomputed forward output), e <- df, g <- de.
    Returns the same (mutated) buffers for convenience.
    """
    n_rows, n_cols = e.shape  # unpack also acts as a 2D-shape check
    total = e.numel()
    grid = lambda meta: (triton.cdiv(total, meta['BLOCK_SIZE']),)
    _approx_backward_kernel[grid](DW, e, g, total, BLOCK_SIZE = 1024,)
    return DW, e, g
pass
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment