# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import *
from .llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
)
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2Attention,
Qwen2DecoderLayer,
Qwen2Model,
Qwen2ForCausalLM,
)
# For Pytorch 2.1.1
try:
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2SdpaAttention,
Qwen2FlashAttention2,
)
except:
Qwen2SdpaAttention = Qwen2Attention
Qwen2FlashAttention2 = Qwen2Attention
pass
class FastQwen2Model(FastLlamaModel):
@staticmethod
def pre_patch():
init_name, function = patch_linear_scaling(
model_name = "qwen2",
rope_module = LlamaRotaryEmbedding,
scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
attention_module = Qwen2Attention,
)
if init_name is not None:
exec(function, globals())
Qwen2Attention.__init__ = eval(init_name)
pass
Qwen2Attention .forward = LlamaAttention_fast_forward
Qwen2SdpaAttention .forward = LlamaAttention_fast_forward
Qwen2FlashAttention2.forward = LlamaAttention_fast_forward
Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward
Qwen2Model .forward = LlamaModel_fast_forward
Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(Qwen2ForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.qwen2.modeling_qwen2
transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding
return
pass
@staticmethod
def from_pretrained(
model_name = "Qwen/Qwen2-7B",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Qwen2 does not support RoPE scaling
fix_tokenizer = True,
model_patcher = None,
tokenizer_name = None,
trust_remote_code = False,
**kwargs,
):
return FastLlamaModel.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = FastQwen2Model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
**kwargs,
)
pass
pass
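# Example usage (illustrative sketch; in Unsloth this is normally reached via
# FastLanguageModel.from_pretrained, which is expected to dispatch here for Qwen2 checkpoints):
#
#   model, tokenizer = FastQwen2Model.from_pretrained(
#       model_name     = "Qwen/Qwen2-7B",
#       max_seq_length = 4096,
#       load_in_4bit   = True,
#   )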
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
from peft.tuners.lora import Linear4bit as Peft_Linear4bit
from peft.tuners.lora import Linear as Peft_Linear
from typing import Optional, Callable, Union, List
import torch
import os
import shutil
import pickle
import gc
from transformers.models.llama.modeling_llama import logger
from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias
import subprocess
import psutil
import re
from .tokenizer_utils import fix_sentencepiece_gguf
__all__ = [
"print_quantization_methods",
"unsloth_save_model",
"save_to_gguf",
"patch_saving_functions",
]
# Check environments
keynames = "\n" + "\n".join(os.environ.keys())
IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
del keynames
# Weights
LLAMA_WEIGHTS = (
"self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj",
"mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
)
LLAMA_LAYERNORMS = (
"input_layernorm", "post_attention_layernorm",
"pre_feedforward_layernorm", "post_feedforward_layernorm",
)
# https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
# From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html
ALLOWED_QUANTS = \
{
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q4_k" : "alias for q4_k_m",
"q5_k" : "alias for q5_k_m",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
# "iq2_xxs" : "2.06 bpw quantization", # Not supported sadly
# "iq2_xs" : "2.31 bpw quantization",
# "iq3_xxs" : "3.06 bpw quantization",
"q3_k_xs" : "3-bit extra small quantization",
}
def print_quantization_methods():
for key, value in ALLOWED_QUANTS.items():
print(f'"{key}" ==> {value}')
pass
pass
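# Example output of print_quantization_methods() (one line per ALLOWED_QUANTS entry):
#   "not_quantized" ==> Recommended. Fast conversion. Slow inference, big files.
#   "fast_quantized" ==> Recommended. Fast conversion. OK inference, OK file size.
#   "q4_k_m" ==> Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K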
def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"):
if not hasattr(model, "_saved_temp_tokenizer"): return False
temp_tokenizer = model._saved_temp_tokenizer
sentencepiece_model = False
file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path)
if not os.path.exists(file_location):
os.makedirs(file_location)
pass
temp_tokenizer.save_pretrained(file_location)
if os.path.isfile(f"{file_location}/tokenizer.model"):
sentencepiece_model = True
pass
shutil.rmtree(file_location, ignore_errors = True)
return sentencepiece_model
pass
def _free_cached_model(model):
from huggingface_hub import scan_cache_dir
cached_repos = list(scan_cache_dir().repos)
# Go through every cached repo, and delete the one that matches the model we want to save.
# Can save 4GB of disk space - useful for Kaggle systems.
for cached_repo in cached_repos:
if cached_repo.repo_id == model.config._name_or_path:
remove_cache_commit = list(cached_repo.revisions)[0].commit_hash
delete_strategy = scan_cache_dir().delete_revisions(remove_cache_commit,)
logger.warning_once(
"Unsloth: Will remove a cached repo with size " + \
delete_strategy.expected_freed_size_str,
)
delete_strategy.execute()
pass
pass
pass
def _merge_lora(layer, name):
bias = None
if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)):
# Is LoRA so we need to merge!
W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer)
if quant_state is not None:
dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2]
W = fast_dequantize(W, quant_state)
else:
dtype = W.dtype
W = W.to(torch.float32).t()
# W = W.t()
if A is not None:
# sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32)))
# W += sAB
W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s)
# W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s)
# if not torch.isfinite(W).all():
maximum_element = torch.max(W.min().abs(), W.max())
if not torch.isfinite(maximum_element).item():
raise ValueError(f"Unsloth: Merge failed.\n{name} has some elements = infinity.")
pass
W = W.t().to(dtype)
else:
W = layer.weight
return W, bias
pass
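# Note on the merge above (assuming the usual PEFT shapes A: (r, in), B: (out, r)):
# it reconstructs the full-precision weight of a LoRA layer as
#   W_merged = W + s * (B @ A)
# where W is the dequantized base weight of shape (out, in) and s is the LoRA
# scaling factor. Working on W.t() lets addmm_ fuse the A^T @ B^T product and
# the accumulation in a single float32 call before casting back to the original dtype.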
def fast_save_pickle(shard, name):
# Use this if # CPUs is <= 2
print(f"Unsloth: Saving {name}...")
torch.save(
shard,
name,
# HIGHEST_PROTOCOL seems to not work with Pytorch!
# pickle_module = pickle,
# pickle_protocol = pickle.HIGHEST_PROTOCOL,
)
return
pass
@torch.inference_mode
def unsloth_save_model(
model,
tokenizer,
save_directory : Union[str, os.PathLike],
save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
# Push to hub
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
create_pr : bool = False,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : List[str] = None,
# Our functions
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.9,
):
if token is None and "HF_TOKEN" in os.environ:
token = os.environ["HF_TOKEN"]
if token is None and "HUGGINGFACE_TOKEN" in os.environ:
token = os.environ["HUGGINGFACE_TOKEN"]
if commit_message is None: commit_message = ""
if "Unsloth" not in commit_message:
commit_message += " (Trained with Unsloth)"
commit_message = commit_message.lstrip()
if commit_description is None:
commit_description = "Upload model trained with Unsloth 2x faster"
elif "Unsloth 2x faster" not in commit_description:
commit_description += " (Trained with Unsloth 2x faster)"
pass
if save_method == "merged_4bit":
raise RuntimeError(
"Unsloth: Merging into 4bit will cause your model to lose accuracy if you plan\n"\
"to merge to GGUF or others later on. I suggest you to do this as a final step\n"\
"if you're planning to do multiple saves.\n"\
"If you are certain, change `save_method` to `merged_4bit_forced`."
)
elif save_method == "merged_4bit_forced":
save_method = "merged_4bit"
pass
save_pretrained_settings = dict(locals())
for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
del save_pretrained_settings[deletion]
pass
# First check for a token!
if push_to_hub:
from huggingface_hub import whoami
try:
username = whoami(token = token)["name"]
except:
raise RuntimeError(
"Unsloth: Please supply a token!\n"\
"Go to https://huggingface.co/settings/tokens"
)
pass
pass
assert(maximum_memory_usage > 0 and maximum_memory_usage <= 0.95)
# Clean memory up first
for _ in range(3):
torch.cuda.empty_cache()
gc.collect()
pass
save_method = save_method.lower().replace(" ", "_")
if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
raise RuntimeError(
"Unsloth: You must select one of 3 options when saving models:\n"\
'"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
'"merged_16bit" ==> This merges LoRA weights and saves to float16. Needed for llama.cpp / GGUF.\n'\
'"merged_4bit" ==> This merges LoRA weights and saves to 4bit. Useful for DPO / inference.'
)
pass
if save_method == "merged_4bit":
print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
print("This might take 5 minutes...")
# Guard against models without LoRA adapters (no merge_and_unload method)
if hasattr(model, "merge_and_unload"):
model = model.merge_and_unload()
pass
print("Done.")
pass
if tags is not None:
assert(isinstance(tags, (list, tuple)))
tags = list(tags) + ["unsloth",]
else:
tags = ["unsloth",]
pass
save_pretrained_settings["tags"] = tags
if ((save_method == "lora") or (save_method == "merged_4bit")) and push_to_hub:
if token is None:
raise RuntimeError(
"Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
"Go to https://huggingface.co/settings/tokens."
)
pass
if save_method == "lora":
print("Unsloth: Saving LoRA adapters. Please wait...")
elif save_method == "merged_4bit":
print("Unsloth: Saving 4bit Bitsandbytes model. Please wait...")
pass
# Update model tag
_ = upload_to_huggingface(
model, save_directory, token,
"finetuned", "trl", file_location = None,
old_username = None, private = private,
)
getattr(model, "original_push_to_hub", tokenizer.push_to_hub)\
(
repo_id = save_directory,
use_temp_dir = use_temp_dir,
commit_message = commit_message,
private = private,
token = token,
max_shard_size = max_shard_size,
create_pr = create_pr,
safe_serialization = safe_serialization,
revision = revision,
commit_description = commit_description,
tags = tags,
)
if tokenizer is not None:
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
getattr(tokenizer, "original_push_to_hub", tokenizer.push_to_hub)\
(
repo_id = save_directory,
use_temp_dir = use_temp_dir,
commit_message = commit_message,
private = private,
token = token,
max_shard_size = max_shard_size,
create_pr = create_pr,
safe_serialization = safe_serialization,
revision = revision,
commit_description = commit_description,
tags = tags,
)
# Revert back padding side
tokenizer.padding_side = old_padding_side
pass
if hasattr(model, "config"):
print(f"Saved {save_method} model to https://huggingface.co/" + save_directory)
pass
return save_directory, None
pass
# Tokenizer has different saving arguments
tokenizer_save_settings = \
{
"save_directory" : save_pretrained_settings["save_directory"],
"legacy_format" : None,
"filename_prefix" : None,
"push_to_hub" : save_pretrained_settings["push_to_hub"],
"private" : save_pretrained_settings["private"],
"token" : save_pretrained_settings["token"],
}
# Check if PEFT Model or not - if yes, 3 levels. If not 2 levels.
from peft import PeftModelForCausalLM
if isinstance(model, PeftModelForCausalLM):
internal_model = model.model
else:
internal_model = model
pass
# Cannot be converted properly!
if (save_method == "merged_4bit") or (save_method == "lora") or (
not hasattr(model, "model") or \
not hasattr(internal_model.model, "layers")
):
# Do general saving
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if save_pretrained_settings["push_to_hub"] is False else \
("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
for deletion in what_to_delete:
del save_pretrained_settings[deletion]
pass
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
# Update model tag
if push_to_hub:
_ = upload_to_huggingface(
model, save_pretrained_settings["save_directory"], token,
"finetuned", "trl", file_location = None,
old_username = None, private = private,
)
pass
if tokenizer is not None:
print("Unsloth: Saving tokenizer...", end = "")
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
tokenizer.save_pretrained(**tokenizer_save_settings)
# Revert back padding side
tokenizer.padding_side = old_padding_side
print(" Done.")
else:
print()
print("Unsloth: Saving model...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
# [TODO] Is this correct?
if save_method == "lora":
save_pretrained_settings["selected_adapters"] = None
pass
model.save_pretrained(**save_pretrained_settings)
if push_to_hub and hasattr(model, "config"):
print("Saved to https://huggingface.co/" + save_pretrained_settings["save_directory"])
pass
print(" Done.")
return save_directory, None
pass
# If push_to_hub, we must strip the "username/" prefix from the repo id
username = None
if push_to_hub and "/" in save_directory:
# +1 solves absolute path issues
username = save_directory[:save_directory.find("/")]
new_save_directory = save_directory[save_directory.find("/")+1:]
logger.warning_once(
f"Unsloth: You are pushing to hub, but you passed your HF username = {username}.\n"\
f"We shall truncate {save_directory} to {new_save_directory}"
)
save_pretrained_settings["save_directory"] = new_save_directory
tokenizer_save_settings ["save_directory"] = new_save_directory
save_directory = new_save_directory
pass
print("Unsloth: Merging 4bit and LoRA weights to 16bit...")
# Determine max RAM usage minus sharding
max_ram = psutil.virtual_memory().available
sharded_ram_usage = 5 * 1024 * 1024 * 1024
if type(max_shard_size) is str:
gb_found = re.match(r"([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
mb_found = re.match(r"([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
if gb_found: sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024
elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024
elif type(max_shard_size) is int:
# max_shard_size is already given in bytes
sharded_ram_usage = max_shard_size
pass
# Switch to our fast saving modules if it's a slow PC!
n_cpus = psutil.cpu_count(logical = False)
if n_cpus is None: n_cpus = psutil.cpu_count()
if n_cpus is None: n_cpus = 1
if safe_serialization is None:
safe_serialization = True
save_pretrained_settings["safe_serialization"] = safe_serialization
elif safe_serialization and (n_cpus <= 2):
logger.warning_once(
f"Unsloth: You have {n_cpus} CPUs. Using `safe_serialization` is 10x slower.\n"\
f"We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.\n"\
f"To force `safe_serialization`, set it to `None` instead.",
)
safe_serialization = False
save_function = fast_save_pickle
save_pretrained_settings["safe_serialization"] = safe_serialization
save_pretrained_settings["save_function"] = save_function
pass
# Only safe_serialization uses more RAM
if safe_serialization:
max_ram -= sharded_ram_usage
else:
max_ram -= sharded_ram_usage*0.25 # Uses much less
pass
max_ram = int(max(0, max_ram) * maximum_memory_usage)
print(f"Unsloth: Will use up to "\
f"{round(max_ram/1024/1024/1024, 2)} out of "\
f"{round(psutil.virtual_memory().total/1024/1024/1024, 2)} RAM for saving.")
# Make directory for disk saving
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
# Check if Kaggle or Colab, since only 20GB of Disk space allowed.
if IS_KAGGLE_ENVIRONMENT or IS_COLAB_ENVIRONMENT:
# We free up 4GB of space
logger.warning_once(
"Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded\n"\
"model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab."
)
_free_cached_model(internal_model)
pass
# HF also uses an OrderedDict
from collections import OrderedDict
state_dict = OrderedDict()
torch_dtype = internal_model.config.torch_dtype
if type(torch_dtype) is str:
if torch_dtype == "float16": torch_dtype = torch.float16
elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16
pass
# Check modules to save float32 dtype
state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype)
max_vram = int(torch.cuda.get_device_properties(0).total_memory * maximum_memory_usage)
from tqdm import tqdm as ProgressBar
for j, layer in enumerate(ProgressBar(internal_model.model.layers)):
for item in LLAMA_WEIGHTS:
proj = eval(f"layer.{item}")
name = f"model.layers.{j}.{item}.weight"
W, bias = _merge_lora(proj, name)
# Bias term
if bias is not None:
state_dict[f"model.layers.{j}.{item}.bias"] = bias
pass
if (torch.cuda.memory_allocated() + W.nbytes) < max_vram:
# Save to GPU memory
state_dict[name] = W
# [TODO] Saving to RAM seems to leak memory???
# elif (max_ram - W.nbytes) > 0:
# # Save to CPU memory
# logger.warning_once(f"We will save to RAM and not VRAM now.")
# state_dict[name] = W.to("cpu", non_blocking = True, copy = True)
# max_ram = max(max_ram - W.nbytes, 0)
else:
# Save to Disk
logger.warning_once(f"We will save to Disk and not RAM now.")
filename = os.path.join(temporary_location, f"{name}.pt")
torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,)
state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True)
pass
for item in LLAMA_LAYERNORMS:
try:
# Skip for Gemma 2
state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data")
except:
continue
pass
pass
state_dict["model.norm.weight"] = internal_model.model.norm.weight.data
# Check for modules_to_save float32 dtype
# Check for tied weights
if internal_model.model.embed_tokens.weight.data_ptr() != internal_model.lm_head.weight.data_ptr():
state_dict["lm_head.weight"] = internal_model.lm_head.weight.data.to(torch_dtype)
pass
# All tensors MUST be type torch.Tensor and not torch.nn.parameter.Parameter
for key, value in state_dict.items():
if hasattr(value, "data"): state_dict[key] = value = value.data
if type(value) is not torch.Tensor:
logger.warning_once(f"Unsloth: {key} is not a Tensor but a {type(value)}.")
pass
pass
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
save_pretrained_settings["state_dict"] = state_dict
# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if not push_to_hub else \
("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
for deletion in what_to_delete:
del save_pretrained_settings[deletion]
pass
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
# Update model tag
if push_to_hub:
_ = upload_to_huggingface(
model, save_pretrained_settings["save_directory"], token,
"finetuned", "trl", file_location = None,
old_username = username, private = private,
)
pass
# First check if we're pushing to an organization!
save_directory = save_pretrained_settings["save_directory"]
if save_pretrained_settings["push_to_hub"]:
new_save_directory, new_username = _determine_username(save_directory, username, token)
if token is not None:
from huggingface_hub import whoami
actual_username = whoami(token = token)["name"]
else:
actual_username = username
pass
# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
# We upload everything at the end!
tokenizer_save_settings["push_to_hub"] = False
tokenizer_save_settings["save_directory"] = new_save_directory
pass
# Save tokenizer
if tokenizer is not None:
print("Unsloth: Saving tokenizer...", end = "")
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
tokenizer.save_pretrained(**tokenizer_save_settings)
# Revert back padding side
tokenizer.padding_side = old_padding_side
print(" Done.")
else:
print()
pass
print("Unsloth: Saving model... This might take 5 minutes for Llama-7b...")
# Since merged, edit quantization_config
old_config = model.config
new_config = model.config.to_dict()
if "quantization_config" in new_config:
del new_config["quantization_config"]
original_model = model
new_config = type(model.config).from_dict(new_config)
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = new_config
model.config = new_config
# Save!
# [TODO] --> is this correct?
# save_pretrained_settings["selected_adapters"] = None
# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
# Pushing to organization!
# Sadly .save_pretrained doesn't work :(
# We first save it via .save_pretrained, then upload manually!
save_pretrained_settings["save_directory"] = new_save_directory
save_pretrained_settings["push_to_hub"] = False
internal_model.save_pretrained(**save_pretrained_settings)
# Now manually go through each file and upload them manually!
filenames = os.listdir(new_save_directory)
from huggingface_hub import HfApi
hf_api = HfApi(token = save_pretrained_settings["token"])
print("Unsloth: Uploading all files... Please wait...")
hf_api.upload_folder(
folder_path = new_save_directory,
path_in_repo = ".",
repo_id = new_save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
ignore_patterns = "*.md",
)
else:
internal_model.save_pretrained(**save_pretrained_settings)
pass
# Revert config back
original_model = model
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = old_config
model.config = old_config
print("Done.")
if push_to_hub and hasattr(model, "config"):
print(f"Saved merged model to https://huggingface.co/{username}/{save_directory.lstrip('/')}")
pass
save_pretrained_settings["state_dict"] = None
for j, (key, value) in enumerate(state_dict.items()):
state_dict[key] = None
if j % 10 == 0:
torch.cuda.empty_cache()
gc.collect()
pass
pass
state_dict = None
del state_dict
torch.cuda.empty_cache()
gc.collect()
# Remove temporary location
import shutil
shutil.rmtree(temporary_location, ignore_errors = True)
for _ in range(3):
torch.cuda.empty_cache()
gc.collect()
return save_directory, username
pass
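# Example usage (illustrative sketch; repo names and the token are placeholders):
#
#   # Merge LoRA into 16bit weights and save locally:
#   unsloth_save_model(model, tokenizer, "finetuned_model", save_method = "merged_16bit")
#
#   # Or push the merged weights straight to the Hub:
#   unsloth_save_model(model, tokenizer, "username/finetuned_model",
#                      save_method = "merged_16bit", push_to_hub = True, token = "hf_...")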
def install_llama_cpp_clone_non_blocking():
full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"]
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_llama_cpp_make_non_blocking():
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# env = { **os.environ, "LLAMA_CUDA": "1", }
# psutil.cpu_count() can return None, so guard against it
n_jobs = max(int((psutil.cpu_count() or 1)*1.5), 1)
# Force make clean
os.system("make clean -C llama.cpp")
full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"]
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_python_non_blocking(packages = []):
full_command = ["pip", "install"] + packages
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
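# These helpers implement a simple overlap pattern: start the git clone / make /
# pip installs as background subprocesses, do other work (e.g. merging weights),
# then .wait() on the handles before their results are needed. A minimal sketch
# of the pattern used further below:
#
#   git_clone = install_llama_cpp_clone_non_blocking()
#   pip_job   = install_python_non_blocking(["gguf", "protobuf"])
#   git_clone.wait()                       # the clone must finish before make
#   make_job  = install_llama_cpp_make_non_blocking()
#   ...do other work...
#   pip_job.wait()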
def install_llama_cpp_old(version = -10):
# Download the 10th latest release since the latest might be broken!
# FALLBACK mechanism
releases = subprocess.check_output(["git", "ls-remote", "--tags", "https://github.com/ggerganov/llama.cpp.git"])
releases = releases.decode("utf-8").replace("\t", " ").split("\n")
for i, x in enumerate(releases):
if "refs/tags/b" not in x: break
releases = releases[:i]
latest = releases[-1]
version = releases[version].split(" ")[0]
# Check if the llama.cpp exists
if os.path.exists("llama.cpp"):
print(
"**[WARNING]** You have a llama.cpp old directory which is broken.\n"\
"Unsloth will DELETE the broken directory and install a new one.\n"\
"Press CTRL + C / cancel this if this is wrong. We shall wait 10 seconds.\n"
)
import time
for i in range(10):
print(f"**[WARNING]** Deleting llama.cpp directory... {10-i} seconds left.")
time.sleep(1)
import shutil
shutil.rmtree("llama.cpp", ignore_errors = True)
pass
# Clone a specific commit
# Also don't use the GPU!
commands = [
"git clone --recursive https://github.com/ggerganov/llama.cpp",
f"cd llama.cpp && git reset --hard {version} && git clean -df",
"make clean -C llama.cpp",
f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
]
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
pass
pass
# Check if successful
if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"):
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or `llama.cpp/quantize` does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
pass
def install_llama_cpp_blocking(use_cuda = False):
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# use_cuda = "LLAMA_CUDA=1" if use_cuda else ""
commands = [
"git clone --recursive https://github.com/ggerganov/llama.cpp",
"make clean -C llama.cpp",
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp",
f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
"pip install gguf protobuf",
]
if os.path.exists("llama.cpp"): return
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
pass
pass
pass
def get_executable(executables):
# Get system PATH directories, split on the OS path separator
system_directories = os.environ.get("PATH").split(os.pathsep)
for directory in system_directories:
for executable in executables:
path = os.path.join(directory, executable)
# Check if the executable exists and is executable
if os.path.exists(path) and os.access(path, os.X_OK): return path
pass
pass
return None
pass
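# Example (illustrative): look for a system-wide llama.cpp quantizer before
# falling back to a local build. Returns the first match on PATH, else None.
#
#   quantize_location = get_executable(["llama-quantize", "quantize"])
#   if quantize_location is None:
#       print("llama.cpp not found on PATH")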
def save_to_gguf(
model_type : str,
model_dtype : str,
is_sentencepiece : bool = False,
model_directory : str = "unsloth_finetuned_model",
quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"]
first_conversion : str = None,
_run_installer = None, # Non blocking install of llama.cpp
):
# logger.warning(
# "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\
# "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\
# "Please be patient - GGUF saving should still work, but might not work as well."
# )
assert(model_dtype == "float16" or model_dtype == "bfloat16")
model_dtype = "f16" if model_dtype == "float16" else "bf16"
# Convert quantization_method to list
if isinstance(quantization_method, list): pass
elif isinstance(quantization_method, str): quantization_method = [ quantization_method, ]
elif isinstance(quantization_method, tuple): quantization_method = list(quantization_method)
else:
raise TypeError("Unsloth: quantization_method can only be a string or a list of strings")
pass
# Check if bfloat16 is supported
if model_dtype == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
"Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
"We shall switch instead to f16."
)
model_dtype = "f16"
pass
# Check first_conversion as well
if first_conversion is None:
first_conversion = model_dtype
pass
# Check I quants
for quant_method in quantization_method:
if quant_method.startswith("iq2"):
raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
pass
# Careful convert.py is only for Llama / Mistral based archs
use_fast_convert = False
if not is_sentencepiece: use_fast_convert = False # Llama-3
elif model_type == "llama": use_fast_convert = True
elif model_type == "mistral": use_fast_convert = True
pass
logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.")
# Map quant methods
new_quantization_method = []
for quant_method in quantization_method:
if quant_method == "not_quantized": quant_method = model_dtype
elif quant_method == "fast_quantized": quant_method = "q8_0"
elif quant_method == "quantized": quant_method = "q4_k_m"
elif quant_method is None: quant_method = "q8_0"
# Check if wrong method
if quant_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
pass
new_quantization_method.append(quant_method)
pass
quantization_method = new_quantization_method
print_info = \
f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\
f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits will take 3 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\
f' "-____-" In total, you will have to wait at least 16 minutes.\n'
print(print_info)
# Check first_conversion format
if first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "f32" : pass
elif first_conversion == "q8_0" : pass
else:
raise RuntimeError(
f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`."
)
pass
# Determine whether the system already has llama.cpp installed and the scripts are executable
quantize_location = get_executable(["llama-quantize", "quantize"])
convert_location = get_executable(["convert-hf-to-gguf.py", "convert_hf_to_gguf.py"])
if quantize_location is not None and convert_location is not None:
print("Unsloth: llama.cpp found in the system. We shall skip installation.")
else:
print("Unsloth: [0] Installing llama.cpp. This will take 3 minutes...")
if _run_installer is not None:
error = _run_installer.wait()
else:
error = 0
install_llama_cpp_blocking()
pass
# Check if successful. If not install 10th latest release
# Careful llama.cpp/quantize changed to llama.cpp/llama-quantize
# and llama.cpp/main changed to llama.cpp/llama-cli
# See https://github.com/ggerganov/llama.cpp/pull/7809
quantize_location = None
if os.path.exists("llama.cpp/quantize"):
quantize_location = "llama.cpp/quantize"
elif os.path.exists("llama.cpp/llama-quantize"):
quantize_location = "llama.cpp/llama-quantize"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
# See https://github.com/unslothai/unsloth/pull/730
# Filenames changed again!
convert_location = None
if os.path.exists("llama.cpp/convert-hf-to-gguf.py"):
convert_location = "llama.cpp/convert-hf-to-gguf.py"
elif os.path.exists("llama.cpp/convert_hf_to_gguf.py"):
convert_location = "llama.cpp/convert_hf_to_gguf.py"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/convert-hf-to-gguf.py' or 'llama.cpp/convert_hf_to_gguf.py' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
if error != 0 or quantize_location is None or convert_location is None:
print(f"Unsloth: llama.cpp error code = {error}.")
install_llama_cpp_old(-10)
pass
pass
# Determine maximum first_conversion state
if first_conversion == "f32" : strength = 3
elif first_conversion == "f16" : strength = 2
elif first_conversion == "bf16" : strength = 1
elif first_conversion == "q8_0" : strength = 0
for quant_method in quantization_method:
if quant_method == "f32": strength = max(strength, 3)
elif quant_method == "f16": strength = max(strength, 2)
elif quant_method == "bf16": strength = max(strength, 1)
elif quant_method == "q8_0": strength = max(strength, 0)
else:
# Quantized models must have f16 as the default argument
if first_conversion == "f32" : pass
elif first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "q8_0":
logger.warning_once(
"Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\
"but saves disk space!"
)
# first_conversion = "f16"
pass
pass
pass
# If only q8_0:
if len(quantization_method) == 1 and quantization_method[0] == "q8_0":
strength = 0
pass
if strength >= 3: first_conversion = "f32"
elif strength >= 2: first_conversion = "f16"
elif strength >= 1: first_conversion = "bf16"
else: first_conversion = "q8_0"
# Non Llama / Mistral archs can only use f32 or f16
if not use_fast_convert and \
(first_conversion != "f16" and first_conversion != "bf16" and first_conversion != "f32"):
pass
# Latest llama.cpp works for all models for q8_0!
# logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.")
# first_conversion = "f16"
pass
# Check if bfloat16 is supported
if first_conversion == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
"Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
"We shall switch instead to f16."
)
first_conversion = "f16"
pass
n_cpus = psutil.cpu_count()
if n_cpus is None: n_cpus = 1
n_cpus *= 2
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
final_location = f"./{model_directory}/unsloth.{first_conversion.upper()}.gguf"
print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\
f"The output location will be {final_location}\n"\
"This will take 3 minutes...")
# We first check if tokenizer.model exists in the model_directory
if os.path.exists(f"{model_directory}/tokenizer.model"):
vocab_type = "spm,hfft,bpe"
# Fix Sentencepiece model as well!
fix_sentencepiece_gguf(model_directory)
else:
vocab_type = "bpe"
pass
# convert.py is deprecated!
use_fast_convert = False
if use_fast_convert:
command = f"python llama.cpp/convert.py {model_directory} "\
f"--outfile {final_location} --vocab-type {vocab_type} "\
f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab"
else:
command = f"python {convert_location} {model_directory} "\
f"--outfile {final_location} "\
f"--outtype {first_conversion}"
pass
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")
full_precision_location = final_location
all_saved_locations = [full_precision_location,]
# Convert each type!
for quant_method in quantization_method:
if quant_method != first_conversion:
print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...")
final_location = f"./{model_directory}/unsloth.{quant_method.upper()}.gguf"
command = f"./{quantize_location} {full_precision_location} "\
f"{final_location} {quant_method} {n_cpus}"
# quantize uses stderr
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
"Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")
all_saved_locations.append(final_location)
pass
pass
return all_saved_locations
pass
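# Example usage (illustrative sketch; the directory must already contain a merged
# 16bit HF checkpoint, e.g. one produced by unsloth_save_model):
#
#   locations = save_to_gguf(
#       model_type          = "llama",
#       model_dtype         = "float16",
#       is_sentencepiece    = True,
#       model_directory     = "unsloth_finetuned_model",
#       quantization_method = ["q4_k_m", "q8_0"],
#   )
#   # locations[0] is the full precision GGUF; the rest are the quantized files.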
def unsloth_save_pretrained_merged(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
"""
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose `save_method` to be one of:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.save_pretrained(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
del arguments["self"]
unsloth_save_model(**arguments)
for _ in range(3):
gc.collect()
pass
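# Example usage (illustrative sketch): `patch_saving_functions` (exported in
# __all__ above) is expected to bind this as a method on the model, so the
# typical call looks like:
#
#   model.save_pretrained_merged("my_model", tokenizer, save_method = "merged_16bit")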
def unsloth_push_to_hub_merged(
self,
repo_id : str,
tokenizer = None,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
token : Union[bool, str, None] = None,
max_shard_size : Union[int, str, None] = "5GB",
create_pr : bool = False,
safe_serialization : bool = True,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : Optional[List[str]] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
"""
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose `save_method` to be one of:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.push_to_hub(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = True
del arguments["self"]
del arguments["repo_id"]
unsloth_save_model(**arguments)
for _ in range(3):
gc.collect()
pass
MODEL_CARD = \
"""---
base_model: {base_model}
tags:
- text-generation-inference
- transformers
- unsloth
- {model_type}
- {extra}
license: apache-2.0
language:
- en
---
# Uploaded {method} model
- **Developed by:** {username}
- **License:** apache-2.0
- **Finetuned from model:** {base_model}
This {model_type} model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
"""
def _determine_username(save_directory, old_username, token):
username = ""
save_directory = save_directory.lstrip("./")
if "/" not in save_directory:
from huggingface_hub import whoami
try:
username = whoami(token = token)["name"]
if type(old_username) is str and username != old_username:
username = old_username
pass
save_directory = f"{username}/{save_directory}"
except:
raise RuntimeError(f"Unsloth: {save_directory} is not a Huggingface directory.")
else:
username = save_directory.split("/")[0]
pass
return save_directory, username
pass
def upload_to_huggingface(
model,
save_directory,
token,
method,
extra = "",
file_location = None,
old_username = None,
private = None,
):
save_directory, username = _determine_username(save_directory, old_username, token)
from huggingface_hub import create_repo
try:
create_repo(
repo_id = save_directory,
token = token,
repo_type = "model",
exist_ok = False,
private = private,
)
# Create model card
from huggingface_hub import ModelCard
content = MODEL_CARD.format(
username = username,
base_model = model.config._name_or_path,
model_type = model.config.model_type,
method = "",
extra = extra,
)
card = ModelCard(content)
card.push_to_hub(save_directory, token = token)
except:
pass
if file_location is not None:
# Now upload file
from huggingface_hub import HfApi
hf_api = HfApi(token = token)
if "/" in file_location:
uploaded_location = file_location[file_location.rfind("/")+1:]
else:
uploaded_location = file_location
pass
# find ftevent file from tensorboard and upload it
import glob
ftevent_files = glob.glob("*out.tfevents*", recursive = True)
if len(ftevent_files) > 0:
print("Unsloth: Uploading tensorboard files... Please wait...", file_location + "*out.tfevents*")
for ftevent_file in ftevent_files:
hf_api.upload_file(
path_or_fileobj = ftevent_file,
path_in_repo = ftevent_file.replace(file_location, ""),
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
hf_api.upload_file(
path_or_fileobj = file_location,
path_in_repo = uploaded_location,
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
# We also upload a config.json file
import json
with open("_temporary_unsloth_config.json", "w") as file:
json.dump({"model_type" : model.config.model_type}, file, indent = 4)
pass
hf_api.upload_file(
path_or_fileobj = "_temporary_unsloth_config.json",
path_in_repo = "config.json",
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
os.remove("_temporary_unsloth_config.json")
pass
return username
pass
def fix_tokenizer_bos_token(tokenizer):
# Check if BOS added already, then warn
fix_bos_token = False
chat_template = getattr(tokenizer, "chat_template", None)
if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)):
if chat_template is not None and \
(
tokenizer.bos_token in chat_template or \
"{bos_token}" in chat_template.replace(" ", "") or \
"{bos_token+" in chat_template.replace(" ", "")
):
fix_bos_token = True
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily."
)
# Remove {{bos_token}}
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template)
# Remove {{bos_token +
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template)
tokenizer.chat_template = new_chat_template
pass
pass
return fix_bos_token, chat_template
pass
def create_ollama_modelfile(tokenizer, gguf_location):
"""
Creates an Ollama Modelfile.
Use ollama.create(model = "new_ollama_model", modelfile = modelfile)
"""
modelfile = getattr(tokenizer, "_ollama_modelfile", None)
if modelfile is None: return None
modelfile = modelfile\
.replace("{{", "⚫@✅#🦥")\
.replace("}}", "⚡@🦥#⛵")\
.format(
__FILE_LOCATION__ = gguf_location,
)\
.replace("⚫@✅#🦥", "{{")\
.replace("⚡@🦥#⛵", "}}")\
.rstrip()
pass
return modelfile
pass
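# Example usage (illustrative; follows the docstring above and assumes the
# `ollama` Python package plus a tokenizer carrying an `_ollama_modelfile` template):
#
#   modelfile = create_ollama_modelfile(tokenizer, "unsloth.Q8_0.gguf")
#   if modelfile is not None:
#       import ollama
#       ollama.create(model = "new_ollama_model", modelfile = modelfile)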
def unsloth_save_pretrained_gguf(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
quantization_method : str = "fast_quantized",
first_conversion : str = None,
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
private : Optional[bool] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.85,
):
"""
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 then converted to GGUF / llama.cpp format.
Choose `quantization_method` to be one of:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q4_k" : "alias for q4_k_m",
"q5_k" : "alias for q5_k_m",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
"iq2_xxs" : "2.06 bpw quantization",
"iq2_xs" : "2.31 bpw quantization",
"iq3_xxs" : "3.06 bpw quantization",
"q3_k_xs" : "3-bit extra small quantization",
"""
if tokenizer is None:
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["quantization_method"]
del arguments["first_conversion"]
# Fix tokenizer adding an extra BOS token at the front
fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
# Non blocking install GGUF first
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
else:
try:
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
except:
# Retry by recloning llama.cpp
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
pass
pass
# Use old chat template if the bos is removed
if fix_bos_token:
tokenizer.chat_template = old_chat_template
pass
for _ in range(3):
gc.collect()
model_dtype = self.config.torch_dtype
model_type = self.config.model_type
if type(model_dtype) is str:
assert(model_dtype == "float16" or model_dtype == "bfloat16")
elif model_dtype == torch.float16:
model_dtype = "float16"
elif model_dtype == torch.bfloat16:
model_dtype = "bfloat16"
else:
raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
pass
is_sentencepiece_model = check_if_sentencepiece_model(self)
# Save to GGUF
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)
# Save Ollama modelfile
modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
modelfile_location = None
if modelfile is not None:
modelfile_location = os.path.join(new_save_directory, "Modelfile")
with open(modelfile_location, "w") as file:
file.write(modelfile)
pass
print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
pass
if fix_bos_token:
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### We removed it in GGUF's chat template for you."
)
pass
if push_to_hub:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
for file_location in all_file_locations:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
# Save modelfile
if modelfile_location is not None:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", modelfile_location, old_username, private,
)
print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
pass
pass
pass
def unsloth_push_to_hub_gguf(
self,
repo_id : str,
tokenizer = None,
quantization_method : str = "fast_quantized",
first_conversion : str = None,
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
token : Union[bool, str, None] = None,
max_shard_size : Union[int, str, None] = "5GB",
create_pr : bool = False,
safe_serialization : bool = True,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : Optional[List[str]] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.85,
):
"""
Same as .push_to_hub(...) except 4-bit weights are automatically
converted to float16, then converted to GGUF / llama.cpp format.
Choose `quantization_method` to be one of:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
"""
if tokenizer is None:
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["repo_id"]
del arguments["quantization_method"]
del arguments["first_conversion"]
# Fix tokenizer adding an extra BOS token at the front
fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
# Non-blocking install of GGUF dependencies first
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
else:
try:
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
except:
# Retry by recloning llama.cpp
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
pass
pass
# Restore the old chat template if the BOS token was removed
if fix_bos_token:
tokenizer.chat_template = old_chat_template
pass
for _ in range(3):
gc.collect()
model_dtype = self.config.torch_dtype
model_type = self.config.model_type
if type(model_dtype) is str:
assert(model_dtype == "float16" or model_dtype == "bfloat16")
elif model_dtype == torch.float16:
model_dtype = "float16"
elif model_dtype == torch.bfloat16:
model_dtype = "bfloat16"
else:
raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
pass
is_sentencepiece_model = check_if_sentencepiece_model(self)
# Save to GGUF
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)
# Save Ollama modelfile
modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
modelfile_location = None
if modelfile is not None:
modelfile_location = os.path.join(new_save_directory, "Modelfile")
with open(modelfile_location, "w") as file:
file.write(modelfile)
pass
print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
pass
print("Unsloth: Uploading GGUF to Huggingface Hub...")
for file_location in all_file_locations:
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
# Save modelfile
if modelfile_location is not None:
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", modelfile_location, old_username, private,
)
print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
pass
if fix_bos_token:
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### We removed it in GGUF's chat template for you."
)
pass
pass
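# Illustrative usage sketch for the method above. It assumes a fine-tuned `model`
# and `tokenizer` are already loaded via FastLanguageModel; the repo id and token
# below are placeholders you would replace:
#
#   model.push_to_hub_gguf(
#       "your-username/your-model-gguf",   # hypothetical repo id
#       tokenizer,
#       quantization_method = "q4_k_m",
#       token = "hf_...",                  # hypothetical HF token
#   )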
# Helper function to save LoRA adapters to a custom directory
def save_lora_to_custom_dir(model, tokenizer, save_directory):
# Create the custom directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)
# Call the unsloth_save_model function with the custom directory
unsloth_save_model(
model,
tokenizer,
save_directory=save_directory,
save_method="lora",
push_to_hub=False,
)
# Method on the model class to convert LoRA adapters to GGML and push them to the Hugging Face Hub
def unsloth_convert_lora_to_ggml_and_push_to_hub(
self,
tokenizer,
repo_id: str,
use_temp_dir: Optional[bool] = None,
commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth",
private: Optional[bool] = None,
token: Union[bool, str, None] = None,
create_pr: bool = False,
revision: str = None,
commit_description: str = "Convert LoRA to GGML format using Unsloth",
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
python_install = install_python_non_blocking(["protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda=False)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
python_install.wait()
else:
makefile = None
for _ in range(3):
gc.collect()
lora_directory_push = "lora-to-ggml-push"
save_lora_to_custom_dir(self, tokenizer, lora_directory_push)
model_type = self.config.model_type
output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin")
print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.")
print(f"The output file will be {output_file}")
command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama"
try:
with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
for line in sp.stdout:
print(line, end="", flush=True)
for line in sp.stderr:
print(line, end="", flush=True)
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, command)
except subprocess.CalledProcessError as e:
print(f"Error: Conversion failed with return code {e.returncode}")
return
print(f"Unsloth: Conversion completed! Output file: {output_file}")
print("Unsloth: Uploading GGML file to Hugging Face Hub...")
username = upload_to_huggingface(
self, repo_id, token,
"GGML converted LoRA", "ggml", output_file, None, private,
)
link = f"{repo_id.lstrip('/')}"
print("Unsloth: Done.")
print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}")
print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
def unsloth_convert_lora_to_ggml_and_save_locally(
self,
save_directory: str, # Added parameter for the folder name
tokenizer,
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
python_install = install_python_non_blocking(["protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda=False)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
python_install.wait()
else:
makefile = None
for _ in range(3):
gc.collect()
# Use the provided save_directory for local saving
save_lora_to_custom_dir(self, tokenizer, save_directory)
model_type = self.config.model_type
output_file = os.path.join(save_directory, "ggml-adapter-model.bin")
print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.")
print(f"The output file will be {output_file}")
command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama"
try:
with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
for line in sp.stdout:
print(line, end="", flush=True)
for line in sp.stderr:
print(line, end="", flush=True)
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, command)
except subprocess.CalledProcessError as e:
print(f"Error: Conversion failed with return code {e.returncode}")
return
print("Unsloth: Done.")
print(f"Unsloth: Conversion completed! Output file: {output_file}")
print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
def patch_saving_functions(model):
import inspect
import types
from typing import Callable, Optional, Union, List
# And now re-add our saving methods!
if model.push_to_hub.__name__ == "unsloth_push_to_hub":
original_push_to_hub = model.original_push_to_hub
else:
original_push_to_hub = model.push_to_hub
pass
signature = str(inspect.signature(original_push_to_hub)).replace("NoneType", "None")
signature = signature[1:]
signature = re.sub("<function save at .+?>", "torch.save", signature)
docs = original_push_to_hub.__doc__.encode("utf-8").decode("utf-8")
push_to_hub_text = f'''def unsloth_push_to_hub(self, {signature}:
"""
{docs}
"""
arguments = dict(locals())
del arguments["self"]
if "tags" in arguments and arguments["tags"] is not None:
assert(isinstance(arguments["tags"], (list, tuple)))
arguments["tags"] = list(arguments["tags"]) + ["unsloth",]
elif "tags" in arguments:
arguments["tags"] = ["unsloth",]
elif hasattr(self, "add_model_tags"):
self.add_model_tags(["unsloth",])
if "commit_message" in arguments:
commit_message = arguments["commit_message"]
if commit_message is not None:
if not commit_message.endswith(" "): commit_message += " "
if "Unsloth" not in commit_message:
commit_message += "(Trained with Unsloth)"
else:
commit_message = "Upload model trained with Unsloth"
arguments["commit_message"] = commit_message
if "commit_description" in arguments:
commit_description = arguments["commit_description"]
if commit_description is not None:
if not commit_description.endswith(" "): commit_description += " "
if "Unsloth" not in commit_description:
commit_description += "(Trained with Unsloth 2x faster)"
else:
commit_description = "Upload model trained with Unsloth 2x faster"
arguments["commit_description"] = commit_description
# Update model tag
if hasattr(self, "config"):
_ = upload_to_huggingface(
self, arguments["repo_id"], arguments["token"],
"finetuned", "trl", file_location = None,
old_username = None, private = arguments["private"],
)
pass
try:
self.original_push_to_hub(**arguments)
except:
del arguments["tags"]
self.original_push_to_hub(**arguments)
pass
if hasattr(self, "config"):
print("Saved model to https://huggingface.co/" + arguments["repo_id"])
pass
'''
exec(push_to_hub_text, globals())
original_model = model
while True:
if original_model.push_to_hub.__name__ != "unsloth_push_to_hub":
original_model.original_push_to_hub = original_model.push_to_hub
original_model.push_to_hub = types.MethodType(unsloth_push_to_hub, original_model)
if hasattr(original_model, "add_model_tags"):
original_model.add_model_tags(["unsloth",])
pass
pass
if hasattr(original_model, "model"): original_model = original_model.model
else: break
pass
# Add saving methods to top level model
if hasattr(model, "config"):
# Counteract tokenizers
model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model)
model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model)
model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model)
model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model)
model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model)
model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model)
pass
return model
pass
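# Illustrative usage sketch: patch_saving_functions is typically invoked by
# Unsloth's model loaders, but conceptually it does the following (the `model`
# here is a placeholder for any loaded causal LM):
#
#   model = patch_saving_functions(model)
#   # model now exposes push_to_hub_merged, save_pretrained_merged,
#   # push_to_hub_gguf, save_pretrained_gguf, push_to_hub_ggml,
#   # save_pretrained_ggml, plus a tagged push_to_hub wrapper.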
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import AutoTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from transformers import PreTrainedTokenizerFast
import re
import os
from transformers.models.llama.modeling_llama import logger
from peft import PeftModelForCausalLM
import torch
import itertools
import collections
import numpy as np
import gc
import subprocess
__all__ = [
"load_correct_tokenizer",
"fix_sentencepiece_tokenizer",
"check_tokenizer",
"add_new_tokens",
"fix_sentencepiece_gguf",
]
IGNORED_TOKENIZER_CHECKING = frozenset((
"CodeLlamaTokenizerFast",
"CodeLlamaTokenizer",
))
IGNORED_TOKENIZER_NAMES = [
# "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
# "unsloth/Mistral-Nemo-Instruct-2407",
# "mistralai/Mistral-Nemo-Instruct-2407",
# "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
# "unsloth/Mistral-Nemo-Base-2407",
# "mistralai/Mistral-Nemo-Base-2407",
]
IGNORED_TOKENIZER_NAMES = frozenset(
[x.lower() for x in IGNORED_TOKENIZER_NAMES]
)
# Check environments
keynames = "\n" + "\n".join(os.environ.keys())
IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
del keynames
def try_fix_tokenizer(tokenizer, prepend = True):
if hasattr(tokenizer, "_tokenizer"):
converted_tokenizer = tokenizer._tokenizer
else:
converted_tokenizer = convert_slow_tokenizer(tokenizer)
pass
tokenizer_string = converted_tokenizer.to_str()
# Llama prepends "▁" (so "apple" becomes "▁apple"). Sometimes this is wrong!
prepend_text = '{"type":"Prepend","prepend":"▁"},'
if not prepend and prepend_text in tokenizer_string:
tokenizer_string = tokenizer_string.replace(prepend_text, "", 1)
pass
dir_names = dir(tokenizer)
# Get eos_token, bos_token etc
token_names = [x for x in dir_names if x.endswith("_token") and x.count("_") == 1]
for token_name in token_names:
token = getattr(tokenizer, token_name, None)
if token is None: continue
token_id = getattr(tokenizer, token_name + "_id", None)
# Locate the token's id mapping in the string
find_text = f'"id":{token_id},"content":"'
start = tokenizer_string.find(find_text) + len(find_text)
if start == -1: continue
end = tokenizer_string.find('",', start)
bad_token = tokenizer_string[start : end]
# Check if token is the actual same one - if not, edit it
if bad_token != token:
bad_text = f'{find_text}{bad_token}",'
good_text = f'{find_text}{token}",'
tokenizer_string = tokenizer_string.replace(bad_text, good_text, 1)
# And replace vocab section
bad_text = f'"{bad_token}":{token_id},'
good_text = f'"{token}":{token_id},'
tokenizer_string = tokenizer_string.replace(bad_text, good_text, 1)
pass
pass
fixed_tokenizer = converted_tokenizer.from_str(tokenizer_string)
return fixed_tokenizer
pass
def get_sorted_dict(dictionary):
sorted_keys = sorted(dictionary.values())
inverted_dictionary = { value : key for key, value in dictionary.items() }
sorted_dictionary = {}
for key in sorted_keys:
value = inverted_dictionary[key]
sorted_dictionary[value] = key
return sorted_dictionary
pass
def convert_to_fast_tokenizer(
slow_tokenizer,
temporary_location = "_unsloth_sentencepiece_temp",
):
is_fast = getattr(slow_tokenizer, "is_fast", False)
if is_fast: return slow_tokenizer
try:
tokenizer_name = slow_tokenizer.__class__.__name__
lowered_tokenizer_name = tokenizer_name.lower()
if lowered_tokenizer_name.endswith("tokenizer"):
class_name = lowered_tokenizer_name[:-len("tokenizer")]
FastTokenizer = eval(
f'__import__(f"transformers.models.{class_name}").{tokenizer_name}Fast'
)
else:
FastTokenizer = PreTrainedTokenizerFast
except:
FastTokenizer = PreTrainedTokenizerFast
pass
# Get all arguments (bos_token, etc)
docs = FastTokenizer.__doc__
docs = docs[docs.find("Args:"):]
args = re.findall(r"\n[\s]+([^\s]{1,}) \(", docs, flags = re.MULTILINE)
args = [x for x in args if not x.endswith("_file")]
# Also some missing maybe!
docs = PreTrainedTokenizerFast.__doc__
docs = docs[docs.find("Args:"):]
args2 = re.findall(r"\n[\s]+([^\s]{1,}) \(", docs, flags = re.MULTILINE)
args2 = [x for x in args2 if not x.endswith("_file")]
args = list(set(args + args2))
kwargs = {}
for arg in args: kwargs[arg] = getattr(slow_tokenizer, arg, None)
kwargs["tokenizer_object"] = try_fix_tokenizer(slow_tokenizer, prepend = True)
fast_tokenizer = FastTokenizer( **kwargs )
# Check if they're similar!
sorted_slow_tokenizer = get_sorted_dict(slow_tokenizer.get_vocab())
sorted_fast_tokenizer = get_sorted_dict(fast_tokenizer.get_vocab())
check_vocab = (sorted_slow_tokenizer == sorted_fast_tokenizer)
check_special = (slow_tokenizer.all_special_tokens == fast_tokenizer.all_special_tokens)
# Failure so return slow_tokenizer
if not check_vocab or not check_special: return slow_tokenizer
# Now confirm if they match
if not assert_same_tokenization(slow_tokenizer, fast_tokenizer):
# Maybe remove the "▁" prepending (i.e. "▁apple" -> "apple")?
kwargs["tokenizer_object"] = try_fix_tokenizer(slow_tokenizer, prepend = False)
fast_tokenizer = FastTokenizer( **kwargs )
if not assert_same_tokenization(slow_tokenizer, fast_tokenizer):
# Failure :(
return slow_tokenizer
pass
pass
# Also tokenizer.model is missing!
name = slow_tokenizer.name_or_path.replace("/", "_")
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
new_location = f"{temporary_location}/{name}"
slow_tokenizer.save_pretrained(new_location)
fast_tokenizer.save_pretrained(new_location)
# Now load it!
fast_tokenizer = AutoTokenizer.from_pretrained(new_location)
if assert_same_tokenization(slow_tokenizer, fast_tokenizer):
return fast_tokenizer
return slow_tokenizer
pass
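# Illustrative usage sketch, assuming `slow_tokenizer` is a slow (sentencepiece
# based) tokenizer you have already loaded; the function falls back to the slow
# tokenizer whenever the fast conversion does not round-trip identically:
#
#   fast_tokenizer = convert_to_fast_tokenizer(slow_tokenizer)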
# Check Mistral chat template without BOS / EOS
mistral_template = \
"{% if messages[0]['role'] == 'system' %}"\
"{% if messages[1]['role'] == 'user' %}"\
"{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[2:] %}"\
"{% else %}"\
"{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% endif %}"\
"{% else %}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '[INST] ' + message['content'] + ' [/INST]' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ message['content'] }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"
pass
# Check Llama chat template without BOS / EOS
llama_template = \
"{% if messages[0]['role'] == 'system' %}"\
"{% if messages[1]['role'] == 'user' %}"\
"{{ '[INST] <<SYS>>\n' + messages[0]['content'] + '\n<</SYS>>\n\n' + messages[1]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[2:] %}"\
"{% else %}"\
"{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% endif %}"\
"{% else %}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '[INST] ' + message['content'].strip() + ' [/INST]' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ ' ' + message['content'].strip() + ' ' }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"
pass
def assert_same_tokenization(slow_tokenizer, fast_tokenizer):
# Get eos_token, bos_token etc
dir_names = dir(slow_tokenizer)
special_tokens = list(filter(None, (
getattr(slow_tokenizer, x) for x in dir_names
if x.endswith("_token") and x.count("_") == 1
)))
all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens))
# Check if chat template is enabled!
check_chat_template1 = True
check_chat_template2 = True
check_chat_template3 = True
"""
Weirdly, Mistral tokenizers are actually correct.
I.e. the checks below would load Mistral v1 and v3 incorrectly, so they are disabled.
slow_chat_template = getattr(slow_tokenizer, "chat_template", None)
fast_chat_template = getattr(fast_tokenizer, "chat_template", None)
messages = [
{"role": "user", "content": " What is 2+2? "},
{"role": "assistant", "content": " It's 4. "},
]
# Check the tokenizer's own chat template
if slow_chat_template is not None and fast_chat_template is not None:
check_chat_template1 = \
slow_tokenizer.apply_chat_template(messages) == \
fast_tokenizer.apply_chat_template(messages)
pass
# Check Mistral chat template without BOS / EOS
slow_tokenizer.chat_template = mistral_template
fast_tokenizer.chat_template = mistral_template
check_chat_template2 = \
slow_tokenizer.apply_chat_template(messages) == \
fast_tokenizer.apply_chat_template(messages)
pass
# Check Llama chat template without BOS / EOS
slow_tokenizer.chat_template = llama_template
fast_tokenizer.chat_template = llama_template
check_chat_template3 = \
slow_tokenizer.apply_chat_template(messages) == \
fast_tokenizer.apply_chat_template(messages)
pass
# Combine them all and revert chat templates
slow_tokenizer.chat_template = slow_chat_template
fast_tokenizer.chat_template = fast_chat_template
"""
check_chat_template = check_chat_template1 and check_chat_template2 and check_chat_template3
# Try special tokens
try:
string = "\n".join(all_special_tokens) + \
"A quick brown fox jumps over the lazy dog!!\n\nHi</s>\n\n" + \
"".join(all_special_tokens)
check_special_tokens = \
slow_tokenizer(string).input_ids == \
fast_tokenizer(string).input_ids
return check_chat_template and check_special_tokens
except:
# For eg see https://github.com/unslothai/unsloth/issues/292
# Sometimes tokenizer has weird tokens, causing a combined tokenization to fail.
# [TODO] We temporarily disable this for CodeLlama tokenizers
if slow_tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING:
return check_chat_template
else:
return False
pass
pass
def fix_sentencepiece_tokenizer(
old_tokenizer,
new_tokenizer,
token_mapping,
temporary_location = "_unsloth_sentencepiece_temp",
):
# From https://github.com/google/sentencepiece/issues/121
# We need to manually edit the sentencepiece tokenizer!
from transformers.utils import sentencepiece_model_pb2
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
# Check if tokenizer.model exists
if not os.path.isfile(f"{temporary_location}/tokenizer.model"):
return new_tokenizer
pass
# First save the old tokenizer
old_tokenizer.save_pretrained(temporary_location)
tokenizer_file = sentencepiece_model_pb2.ModelProto()
tokenizer_file.ParseFromString(open(f"{temporary_location}/tokenizer.model", "rb").read())
# Now save the new tokenizer
new_tokenizer.save_pretrained(temporary_location)
# Now correct the old tokenizer's .model file
for old_token, new_token in token_mapping.items():
ids = old_tokenizer([old_token], add_special_tokens = False).input_ids
ids = ids[0]
if (len(ids) != 1):
# Skip this token!
print(f"Skip mapping {old_token} to {new_token} since {new_token} is already in the tokenizer!")
continue
pass
ids = ids[0]
# [TODO] Hack for Starling - try except
try:
tokenizer_piece = tokenizer_file.pieces[ids]
except:
continue
assert(tokenizer_piece.piece == old_token)
tokenizer_piece.piece = new_token
pass
# And now write it
with open(f"{temporary_location}/tokenizer.model", "wb") as file:
file.write(tokenizer_file.SerializeToString())
pass
# And load it!
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
temporary_location,
eos_token = new_tokenizer.eos_token,
pad_token = new_tokenizer.pad_token,
)
return tokenizer
pass
def fix_sentencepiece_gguf(saved_location):
"""
Fixes sentencepiece tokenizers which did not extend the vocabulary with
user defined tokens.
Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py
"""
from copy import deepcopy
from transformers.utils import sentencepiece_model_pb2
import json
from enum import IntEnum
class SentencePieceTokenTypes(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
pass
# Load tokenizer.model
tokenizer_file = sentencepiece_model_pb2.ModelProto()
if not os.path.isfile(f"{saved_location}/tokenizer.model"): return
tokenizer_file.ParseFromString(open(f"{saved_location}/tokenizer.model", "rb").read())
sentence_piece_size = len(tokenizer_file.pieces)
# Load added_tokens_json
if not os.path.isfile(f"{saved_location}/added_tokens.json"): return
with open(f"{saved_location}/added_tokens.json", "r", encoding = "utf-8") as file:
added_tokens_json = json.load(file)
pass
if len(added_tokens_json) == 0: return
added_tokens_json = dict(sorted(added_tokens_json.items(), key = lambda item: item[1]))
new_size = sentence_piece_size + len(added_tokens_json)
# Confirm added_tokens_json is correct
added_tokens_ids = np.array(list(added_tokens_json.values()))
diff = np.diff(added_tokens_ids)
if (diff.min() != 1 or diff.max() != 1): return
if (added_tokens_ids.min() != sentence_piece_size): return
# Edit sentence piece tokens with added_tokens_json
logger.warning(
f"Unsloth: Extending {saved_location}/tokenizer.model with added_tokens.json.\n"\
f"Originally tokenizer.model is of size ({sentence_piece_size}).\n"\
f"But we need to extend to sentencepiece vocab size ({new_size})."
)
new_tokens = deepcopy(tokenizer_file.pieces[-len(added_tokens_ids):])
for new_token, added_token in zip(new_tokens, added_tokens_json.keys()):
new_token.piece = added_token.encode("utf-8")
new_token.score = -1000.0
new_token.type = SentencePieceTokenTypes.USER_DEFINED
pass
tokenizer_file.pieces.extend(new_tokens)
with open(f"{saved_location}/tokenizer.model", "wb") as file:
file.write(tokenizer_file.SerializeToString())
pass
# Add padding tokens
# actual_vocab_size = model.config.vocab_size
# padding = actual_vocab_size - len(tokenizer_file.pieces)
return
pass
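# Illustrative usage sketch: call this on a directory that already contains both
# tokenizer.model and added_tokens.json (the path below is a placeholder):
#
#   fix_sentencepiece_gguf("path/to/saved_model")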
def load_correct_tokenizer(
tokenizer_name,
model_max_length = None,
padding_side = "right",
token = None,
trust_remote_code = False,
cache_dir = "huggingface_tokenizers_cache",
):
if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT:
cache_dir = cache_dir
else:
cache_dir = None
pass
# Try loading the slow tokenizer. If it fails, then try Fast only
# Mainly to solve Deepseek models with no tokenizer.model file
slow_tokenizer = None
try:
slow_tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
trust_remote_code = trust_remote_code,
# Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373
use_fast = False,
legacy = False,
from_slow = True,
cache_dir = cache_dir,
)
except:
pass
# print(
# f"Unsloth: {tokenizer_name} has no tokenizer.model file.\n"\
# "Just informing you about this - this is not a critical error."
# )
pass
fast_tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
trust_remote_code = trust_remote_code,
cache_dir = cache_dir,
)
if tokenizer_name in IGNORED_TOKENIZER_NAMES:
return fast_tokenizer
elif slow_tokenizer is not None:
if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"):
fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token
if hasattr(fast_tokenizer, "add_eos_token") and hasattr(slow_tokenizer, "add_eos_token"):
fast_tokenizer.add_eos_token = slow_tokenizer.add_eos_token
# Confirm if slow and fast are equivalent!
if assert_same_tokenization(slow_tokenizer, fast_tokenizer):
return fast_tokenizer
else:
logger.warning(f"Unsloth: Will load {tokenizer_name} as a legacy tokenizer.")
return convert_to_fast_tokenizer(slow_tokenizer)
pass
else:
return fast_tokenizer
pass
pass
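# Illustrative usage sketch; the tokenizer repo below is just an example name:
#
#   tokenizer = load_correct_tokenizer(
#       "unsloth/llama-2-7b-bnb-4bit",   # any HF tokenizer repo
#       model_max_length = 4096,
#       padding_side = "right",
#   )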
def check_tokenizer(
model,
tokenizer,
model_name = "unsloth/llama-2-7b-bnb-4bit",
model_max_length = 4096,
padding_side = "right",
token = None,
_reload = True,
):
# Checks tokenizer for out of bounds ids.
# Mainly a fix for https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
# where <sep> had token id=32002.
# See https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha/discussions/25
# Seems like the Fast tokenizer in Rust breaks things!
# We ignore some of them!
if tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING:
return tokenizer
pass
max_embedding_size = model.model.embed_tokens.weight.shape[0]
added_tokens_fast = tokenizer.added_tokens_decoder
added_tokens_fast = {index : str(value) for index, value in added_tokens_fast.items()}
sorted_keys = sorted(added_tokens_fast)
added_tokens_fast = {key : added_tokens_fast[key] for key in sorted_keys}
for j, index in enumerate(added_tokens_fast.keys()):
if index >= max_embedding_size:
bad_indices = list(added_tokens_fast.keys ())[j:]
bad_tokens = list(added_tokens_fast.values())[j:]
if not _reload:
# Try removing the token
added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()]
special_tokens = tokenizer.special_tokens_map
import itertools
special_tokens = frozenset(
itertools.chain.from_iterable(
[x] if type(x) is str else x for x in special_tokens.values()
)
)
can_be_removed1 = [x for x in bad_tokens if x not in special_tokens]
can_be_removed2 = [x for x in can_be_removed1 if x in tokenizer._added_tokens_encoder.keys()]
# Check if the extra tokens can in fact be removed!
can_be_removed = \
(len(can_be_removed1) == len(bad_tokens)) and \
(len(can_be_removed2) == len(bad_tokens))
# Check if sep_token or other generic types
remove_generic = False
try_mapper = []
if not can_be_removed:
names = dir(tokenizer)
names = (x for x in names if x.endswith("_token") and x.count("_") == 1)
generic_tokens = [(x, getattr(tokenizer, x, None)) for x in names]
try_removal = []
for token in bad_tokens:
for (name_token, check_token) in generic_tokens:
if check_token == token:
try_removal.append(token)
try_mapper.append(name_token)
pass
pass
pass
# Recheck!
can_be_removed = (len(try_removal) == len(bad_tokens))
if can_be_removed: remove_generic = True
can_be_removed1 = bad_tokens
pass
if can_be_removed:
# Yes it can be fixed!
for j, bad_token in enumerate(can_be_removed1):
remove_id = tokenizer._added_tokens_encoder[bad_token]
del tokenizer._added_tokens_decoder[remove_id]
del tokenizer._added_tokens_encoder[bad_token]
if remove_generic and (try_removal[j] == bad_token):
# Remove sep token for example
setattr(tokenizer, try_mapper[j], None)
setattr(tokenizer, try_mapper[j] + "_id", None)
pass
pass
# Confirm 1 more time!
if max(tokenizer.added_tokens_decoder.keys()) < max_embedding_size:
logger.warning_once(
f"Unsloth loaded a broken tokenizer `{model_name}`, but managed to repair it!\n"\
f"Tokens {bad_tokens} with ids {bad_indices} exceeds the max vocab size of {max_embedding_size}.\n"\
"We removed these bad tokens. If you think this is incorrect, fix your tokenizer first."
)
return convert_to_fast_tokenizer(tokenizer)
pass
pass
# :( Failure
raise RuntimeError(
f"Unsloth tried to load `{model_name}`, but cannot succeed.\n"\
f"Tokens {bad_tokens} with ids {bad_indices} exceeds the max vocab size of {max_embedding_size}.\n"\
f"Fix your tokenizer since it'll perform out of bounds memory accesses."
)
pass
if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT:
cache_dir = "huggingface_tokenizers_cache"
else:
cache_dir = None
pass
# Sometimes the slow tokenizer does not work (e.g. Deepseek)
try:
# Try slow tokenizer which can fix things!
tokenizer = AutoTokenizer.from_pretrained(
model_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
# Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373
use_fast = False,
legacy = False,
from_slow = True,
cache_dir = cache_dir,
)
return check_tokenizer(
model = model,
tokenizer = tokenizer,
model_name = model_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
_reload = False,
)
except:
# Tokenizer has out of bounds issues and we can't
# load the slow tokenizer version :(
logger.warning_once(
"Unsloth: Tokenizer is most likely buggy, and Unsloth failed to repair it.\n"\
"It will still work, but beware of out of bounds memory accesses.\n"\
"Please file an issue on the model owner's repo about this issue."
)
return tokenizer
pass
pass
pass
return convert_to_fast_tokenizer(tokenizer)
pass
@torch.inference_mode
def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16):
"""
Llama-3, for example, has untrained vectors in the base model.
These include <|eot_id|>, <|start_header_id|> and <|end_header_id|>.
We reset them to the mean of the rest of the tokens.
"""
embedding_matrix = model.get_input_embeddings ().weight
lm_head_matrix = model.get_output_embeddings().weight
# Ignore some model checks for now
if model.config._name_or_path in IGNORED_TOKENIZER_NAMES:
return
pass
# Get untrained tokens
indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps
# Check lm_head as well
indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps
# Combine both checks
indicator_untrained = indicator_untrained1 & indicator_untrained2
where_untrained = torch.where(indicator_untrained)[0]
n_untrained = where_untrained.shape[0]
n_trained = embedding_matrix.shape[0] - n_untrained
# Get set and actual tokens
where_untrained = where_untrained.tolist()
if len(where_untrained) == 0: return
# Build a fast lookup set of untrained ids and get their token strings
where_untrained_set = frozenset(where_untrained)
actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained)
# Remove None items in actual_bad_tokens
actual_bad_tokens = [x for x in actual_bad_tokens if x is not None]
# Check if tokenizer and training datasets have bad tokens
if_bad_first = False
if_bad_second = False
# Check tokenizer's chat template for any untrained tokens
chat_template = getattr(tokenizer, "chat_template", None)
if chat_template is not None:
if_bad_first = any(x in chat_template for x in actual_bad_tokens)
pass
# Check the first 250, last 250 input_ids
size_dataset = len(train_dataset)
size = min(size_dataset, 250)
for j in range(size):
input_ids = train_dataset[j]
if "input_ids" in input_ids:
input_ids = input_ids["input_ids"]
if_bad = any(item in where_untrained_set for item in input_ids)
if if_bad:
if_bad_second = True
break
pass
pass
pass
# Check last 250
if not if_bad_second:
left = max(size_dataset-250, 0)
for j in range(left, size_dataset):
input_ids = train_dataset[j]
if "input_ids" in input_ids:
input_ids = input_ids["input_ids"]
if_bad = any(item in where_untrained_set for item in input_ids)
if if_bad:
if_bad_second = True
break
pass
pass
pass
pass
# Check if bad tokens exist!
if not if_bad_first and not if_bad_second: return
# Check if lm_head / embed_tokens are trainable!
bad_not_trainable = False
if not embedding_matrix.requires_grad: bad_not_trainable = True
if not lm_head_matrix .requires_grad: bad_not_trainable = True
if bad_not_trainable:
raise ValueError(
'Unsloth: Untrained tokens found, but embed_tokens & lm_head not trainable, causing NaNs. '\
'Restart then add `embed_tokens` & `lm_head` to '\
'`FastLanguageModel.get_peft_model(target_modules = [..., "embed_tokens", "lm_head",]). `'\
'Are you using the `base` model? Instead, use the `instruct` version to silence this warning.',
)
pass
# Count all the possible bad tokens
final_counts = np.zeros(max(len(tokenizer), embedding_matrix.shape[0]), dtype = np.int64)
def mapping(examples):
input_ids = examples["input_ids"]
counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype = np.int32)
np.add.at(final_counts, counter, 1)
pass
train_dataset.map(mapping, batched = True, desc = "Counting untrained tokens")
# Get sum of all items
sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0)
sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0)
# Remove bad tokens
sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0)
sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0)
# Find correct average by dividing by sum of trained tokens
mean_embedding = (sum_embedding / n_trained)
mean_lm_head = (sum_lm_head / n_trained)
# Scale each to be equal to 1/max_frequency. Also set some to 0 if none seen
scaling = final_counts[where_untrained] / max(final_counts.max(), 1)
scaling = torch.tensor(scaling, device = mean_embedding.device).unsqueeze(1)
mean_embedding = mean_embedding.repeat((n_untrained, 1,)) * scaling
mean_lm_head = mean_lm_head .repeat((n_untrained, 1,)) * scaling
where_null = scaling.ravel() == 0
mean_embedding[where_null] = 0
mean_lm_head [where_null] = 0
# Set them to the mean
logger.warning(
"Unsloth: Setting embed_tokens & lm_head untrained tokens to "\
"mean(trained) to counteract NaNs during training."
)
embedding_matrix[where_untrained] = mean_embedding.to(embedding_matrix.dtype)
lm_head_matrix [where_untrained] = mean_lm_head .to(lm_head_matrix .dtype)
# Clean up
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
pass
return
pass
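# Illustrative usage sketch: this is normally invoked automatically by the
# patched SFTTrainer.train (see patch_sft_trainer_tokenizer below), but it can
# also be called manually on a tokenized dataset with an "input_ids" column:
#
#   fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16)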
@torch.inference_mode
def mean_of_trained_tokens(model, eps = 1e-16):
"""
Llama-3, for example, has untrained vectors in the base model.
These include <|eot_id|>, <|start_header_id|> and <|end_header_id|>.
We reset them to the mean of the rest of the tokens.
"""
embedding_matrix = model.get_input_embeddings ().weight.clone()
lm_head_matrix = model.get_output_embeddings().weight.clone()
# Get untrained tokens
indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps
where_untrained = torch.where(indicator_untrained)[0]
n_untrained = where_untrained.shape[0]
n_trained = embedding_matrix.shape[0] - n_untrained
# if n_untrained != 0:
# print(
# f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\
# "We shall set them to the mean of the other trained tokens."
# )
# pass
# Get sum of all items
sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0)
sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0)
# Remove bad tokens
sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0)
sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0)
# Find correct average by dividing by sum of trained tokens
mean_embedding = (sum_embedding / n_trained)
mean_lm_head = (sum_lm_head / n_trained)
return mean_embedding, mean_lm_head
pass
@torch.inference_mode
def add_new_tokens(
model,
tokenizer,
new_tokens = [],
method = "mean",
interpolation = 0.5,
):
"""
Smartly resizes the tokenizer and adds new tokens to the model.
We also disregard untrained tokens by removing them from the mean calculation.
"""
assert(isinstance(new_tokens, (list, tuple)))
assert(len(new_tokens) > 0)
assert(method == "mean" or method == "interpolation")
assert(interpolation >= 0 and interpolation <= 1)
# Check if tokens already exist
overlapping_tokens = set(new_tokens) & set(tokenizer.vocab.keys())
if len(overlapping_tokens) != 0:
print(
f"Unsloth: You're adding new_tokens = {new_tokens}\n"\
f"There are tokens which are overlapping = {list(overlapping_tokens)}\n"\
f"We shall safely ignore these overlapping tokens."
)
new_tokens = [x for x in new_tokens if x not in overlapping_tokens]
pass
# Get mean of trained tokens
# mean_embedding, mean_lm_head = fix_untrained_tokens(model)
# Be careful: reserved (untrained) tokens can unexpectedly show up here
mean_embedding, mean_lm_head = mean_of_trained_tokens(model)
mean_embedding = mean_embedding.to(torch.float32)
mean_lm_head = mean_lm_head .to(torch.float32)
# Add tokens!
old_length = len(tokenizer)
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
# If we use interpolation, we interpolate between the mean of all trained embeddings
# and the mean of each new token's subword embeddings
embedding_matrix = model.get_input_embeddings ().weight
lm_head_matrix = model.get_output_embeddings().weight
if method == "interpolation":
print(
"Unsloth: You are using interpolation to add new tokens.\n"\
f"We shall set new tokens = mean(embeddings)*{1-interpolation} + mean(new_tokens)*{interpolation}"
)
for j, token in enumerate(new_tokens):
input_ids = tokenizer(token, add_special_tokens = False).input_ids
mean_embedding_token = embedding_matrix[input_ids].mean(axis = 0, dtype = torch.float32)
mean_lm_head_token = lm_head_matrix [input_ids].mean(axis = 0, dtype = torch.float32)
# Interpolate
mean_embedding_token = mean_embedding*(1-interpolation) + mean_embedding_token*interpolation
mean_lm_head_token = mean_lm_head *(1-interpolation) + mean_lm_head_token *interpolation
# Set the new vector
embedding_matrix[old_length+j] = mean_embedding_token
lm_head_matrix [old_length+j] = mean_lm_head_token
pass
else:
# Now set the new tokens to the mean!
embedding_matrix[old_length:] = mean_embedding
lm_head_matrix [old_length:] = mean_lm_head
pass
# We set a flag to say we need to train embeddings
internal_model = model
while hasattr(internal_model, "model"):
internal_model._need_to_train_embeddings = True
internal_model = internal_model.model
pass
internal_model._need_to_train_embeddings = True
return
pass
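# Illustrative usage sketch; the tokens below are placeholders:
#
#   add_new_tokens(
#       model, tokenizer,
#       new_tokens = ["<|my_token_0|>", "<|my_token_1|>"],
#       method = "interpolation",
#       interpolation = 0.5,
#   )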
def check_nvidia():
# Unsloth doesn't work yet on AMD devices - we're working on it!
output = np.array([0,])
try:
output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
except:
if not torch.cuda.is_available():
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
return output
pass
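# Illustrative note: check_nvidia() shells out to nvidia-smi and returns a numpy
# array of per-GPU used memory in GiB, e.g. array([3.2, 0.5]) on a hypothetical
# 2-GPU machine. PRE_CHECK below snapshots this at import time so the patched
# trainer can later detect unsupported multi-GPU usage.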
PRE_CHECK = check_nvidia()
from inspect import getsource
import trl.trainer.sft_trainer
from trl.trainer.sft_trainer import *
from transformers.trainer import *
def patch_sft_trainer_tokenizer():
"""
Patches TRL's SFTTrainer: tokenization avoids adding a duplicate BOS token,
and .train() runs GPU checks and fixes untrained tokens before training.
"""
for function_name, replacer in (
("_prepare_non_packed_dataloader", "def tokenize(element):",),
# ("_prepare_packed_dataloader", "if dataset_text_field is not None",),
):
function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}"))
where = function.find("def")
function = function.split("\n")
function = "\n".join(x[where:] for x in function)
check_text = \
"\n"\
"test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])[0]\n"\
"chat_template = getattr(tokenizer, 'chat_template', None)\n"\
"chat_template = '' if chat_template is None else chat_template\n"\
"has_bos_token_already = (test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template) "\
"if getattr(tokenizer, 'bos_token', None) is not None else False\n"\
"add_special_tokens = False if has_bos_token_already else add_special_tokens\n\n"
check_text = check_text.split("\n")
check_text = "\n".join(" "*where + x for x in check_text)
function = function.replace(replacer, check_text + replacer)
exec(function, globals())
exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals())
pass
# Patch train with fix_untrained_tokens
function_name, replacer = "train", "if resume_from_checkpoint is False:"
function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}"))
where = function.find("def")
function = function.split("\n")
function = "\n".join(x[where:] for x in function)
check_text = \
"\n"\
"if self._inner_training_loop.__name__ != '_fast_inner_training_loop':\n"\
" raise RuntimeError(\n"\
" 'Please do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'\n"\
" )\n"\
"pass\n"\
"import subprocess, re, gc, numpy as np\n"\
"a = np.array([0,])\n"\
"try:\n"\
" a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
" a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
" a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
"except:\n"\
" if not torch.cuda.is_available():\n"\
" raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
"if ((a - PRE_CHECK) >= 1).sum() > 1:\n"\
" raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
"for _ in range(3):\n"\
" gc.collect()\n"\
" torch.cuda.empty_cache()\n"\
"pass\n"\
"\n"\
"fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n"
check_text = check_text.split("\n")
check_text = "\n".join(" "*where + x for x in check_text)
function = function.replace(replacer, check_text + replacer)
exec(function, globals())
exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals())
pass
patch_sft_trainer_tokenizer()
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import Optional
from transformers import TrainingArguments
from trl import SFTTrainer
from . import is_bfloat16_supported
__all__ = [
"UnslothTrainingArguments",
"UnslothTrainer",
]
@dataclass
class UnslothTrainingArguments(TrainingArguments):
embedding_learning_rate : Optional[float] = field(
default = None,
metadata = {"help" : "Different learning rates for embeddings and lm_head."}
)
pass
def _create_unsloth_optimizer(
model,
optimizer_cls,
optimizer_kwargs,
embedding_lr = 5e-5,
):
lr = optimizer_kwargs["lr"]
weight_decay = optimizer_kwargs.get("weight_decay", 0.0)
param_groups = \
{
"non_embeddings" : {},
"embeddings" : {},
}
for name, param in model.named_parameters():
if not param.requires_grad: continue
if name.endswith("modules_to_save.default.weight"):
partial_name = name[:-len(".modules_to_save.default.weight")]
partial_name = partial_name[partial_name.rfind(".")+1:]
print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {partial_name}.")
param_groups["embeddings"] [name] = param
else:
param_groups["non_embeddings"][name] = param
pass
pass
optimizer_grouped_parameters = [
{
"params" : list(param_groups["non_embeddings"].values()),
"weight_decay" : weight_decay,
"lr" : lr,
},
{
"params" : list(param_groups["embeddings"].values()),
"weight_decay" : weight_decay,
"lr" : embedding_lr,
},
]
optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
return optimizer
pass
class UnslothTrainer(SFTTrainer):
def create_optimizer(self):
embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None)
if embedding_learning_rate is None: return super().create_optimizer()
if self.optimizer is None:
optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args)
self.optimizer = _create_unsloth_optimizer(
self.model,
optimizer_cls,
optimizer_kwargs,
embedding_learning_rate,
)
pass
return self.optimizer
pass
pass
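# Illustrative usage sketch, assuming `model`, `tokenizer` and `dataset` already
# exist; embedding_learning_rate gives embed_tokens / lm_head a smaller learning
# rate than the rest of the network. Paths and column names are placeholders:
#
#   args = UnslothTrainingArguments(
#       output_dir = "outputs",              # hypothetical path
#       per_device_train_batch_size = 2,
#       learning_rate = 2e-4,
#       embedding_learning_rate = 2e-5,
#       bf16 = is_bfloat16_supported(),
#   )
#   trainer = UnslothTrainer(
#       model = model,
#       tokenizer = tokenizer,
#       train_dataset = dataset,
#       dataset_text_field = "text",         # hypothetical column name
#       args = args,
#   )
#   trainer.train()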