# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import *
from .llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
)
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2Attention,
Qwen2DecoderLayer,
Qwen2Model,
Qwen2ForCausalLM,
)
# For Pytorch 2.1.1
try:
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2SdpaAttention,
Qwen2FlashAttention2,
)
except:
Qwen2SdpaAttention = Qwen2Attention
Qwen2FlashAttention2 = Qwen2Attention
pass
class FastQwen2Model(FastLlamaModel):
@staticmethod
def pre_patch():
init_name, function = patch_linear_scaling(
model_name = "qwen2",
rope_module = LlamaRotaryEmbedding,
scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
attention_module = Qwen2Attention,
)
if init_name is not None:
exec(function, globals())
Qwen2Attention.__init__ = eval(init_name)
pass
Qwen2Attention .forward = LlamaAttention_fast_forward
Qwen2SdpaAttention .forward = LlamaAttention_fast_forward
Qwen2FlashAttention2.forward = LlamaAttention_fast_forward
Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward
Qwen2Model .forward = LlamaModel_fast_forward
Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(Qwen2ForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.qwen2.modeling_qwen2
transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding
return
pass
@staticmethod
def from_pretrained(
model_name = "Qwen/Qwen2-7B",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Qwen2 does not support RoPE scaling
fix_tokenizer = True,
model_patcher = None,
tokenizer_name = None,
trust_remote_code = False,
**kwargs,
):
return FastLlamaModel.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = FastQwen2Model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
**kwargs,
)
pass
pass
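# Example usage (illustrative sketch; in Unsloth this is normally reached via
# FastLanguageModel.from_pretrained, which is expected to dispatch here for Qwen2 checkpoints):
#
#   model, tokenizer = FastQwen2Model.from_pretrained(
#       model_name     = "Qwen/Qwen2-7B",
#       max_seq_length = 4096,
#       load_in_4bit   = True,
#   )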
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
from peft.tuners.lora import Linear4bit as Peft_Linear4bit
from peft.tuners.lora import Linear as Peft_Linear
from typing import Optional, Callable, Union, List
import torch
import os
import shutil
import pickle
import gc
from transformers.models.llama.modeling_llama import logger
from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias
import subprocess
import psutil
import re
from .tokenizer_utils import fix_sentencepiece_gguf
__all__ = [
"print_quantization_methods",
"unsloth_save_model",
"save_to_gguf",
"patch_saving_functions",
]
# Check environments
keynames = "\n" + "\n".join(os.environ.keys())
IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
del keynames
# Weights
LLAMA_WEIGHTS = (
"self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj",
"mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
)
LLAMA_LAYERNORMS = (
"input_layernorm", "post_attention_layernorm",
"pre_feedforward_layernorm", "post_feedforward_layernorm",
)
# https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
# From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html
ALLOWED_QUANTS = \
{
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q4_k" : "alias for q4_k_m",
"q5_k" : "alias for q5_k_m",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
# "iq2_xxs" : "2.06 bpw quantization", # Not supported sadly
# "iq2_xs" : "2.31 bpw quantization",
# "iq3_xxs" : "3.06 bpw quantization",
"q3_k_xs" : "3-bit extra small quantization",
}
def print_quantization_methods():
for key, value in ALLOWED_QUANTS.items():
print(f'"{key}" ==> {value}')
pass
pass
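# Example output of print_quantization_methods() (one line per ALLOWED_QUANTS entry):
#   "not_quantized" ==> Recommended. Fast conversion. Slow inference, big files.
#   "fast_quantized" ==> Recommended. Fast conversion. OK inference, OK file size.
#   "q4_k_m" ==> Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K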
def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"):
if not hasattr(model, "_saved_temp_tokenizer"): return False
temp_tokenizer = model._saved_temp_tokenizer
sentencepiece_model = False
file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path)
if not os.path.exists(file_location):
os.makedirs(file_location)
pass
temp_tokenizer.save_pretrained(file_location)
if os.path.isfile(f"{file_location}/tokenizer.model"):
sentencepiece_model = True
pass
shutil.rmtree(file_location, ignore_errors = True)
return sentencepiece_model
pass
def _free_cached_model(model):
from huggingface_hub import scan_cache_dir
cached_repos = list(scan_cache_dir().repos)
# Go through every cached repo, and delete the one that matches the model we want to save.
# Can save 4GB of disk space - useful for Kaggle systems.
for cached_repo in cached_repos:
if cached_repo.repo_id == model.config._name_or_path:
remove_cache_commit = list(cached_repo.revisions)[0].commit_hash
delete_strategy = scan_cache_dir().delete_revisions(remove_cache_commit,)
logger.warning_once(
"Unsloth: Will remove a cached repo with size " + \
delete_strategy.expected_freed_size_str,
)
delete_strategy.execute()
pass
pass
pass
def _merge_lora(layer, name):
bias = None
if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)):
# Is LoRA so we need to merge!
W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer)
if quant_state is not None:
dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2]
W = fast_dequantize(W, quant_state)
else:
dtype = W.dtype
W = W.to(torch.float32).t()
# W = W.t()
if A is not None:
# sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32)))
# W += sAB
W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s)
# W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s)
# if not torch.isfinite(W).all():
maximum_element = torch.max(W.min().abs(), W.max())
if not torch.isfinite(maximum_element).item():
raise ValueError(f"Unsloth: Merge failed.\n{name} has some elements = infinity.")
pass
W = W.t().to(dtype)
else:
W = layer.weight
return W, bias
pass
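# Note on the merge above (assuming the usual PEFT shapes A: (r, in), B: (out, r)):
# it reconstructs the full-precision weight of a LoRA layer as
#   W_merged = W + s * (B @ A)
# where W is the dequantized base weight of shape (out, in) and s is the LoRA
# scaling factor. Working on W.t() lets addmm_ fuse the A^T @ B^T product and
# the accumulation in a single float32 call before casting back to the original dtype.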
def fast_save_pickle(shard, name):
# Use this if # CPUs is <= 2
print(f"Unsloth: Saving {name}...")
torch.save(
shard,
name,
# HIGHEST_PROTOCOL seems to not work with Pytorch!
# pickle_module = pickle,
# pickle_protocol = pickle.HIGHEST_PROTOCOL,
)
return
pass
@torch.inference_mode
def unsloth_save_model(
model,
tokenizer,
save_directory : Union[str, os.PathLike],
save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
# Push to hub
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
create_pr : bool = False,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : List[str] = None,
# Our functions
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.9,
):
if token is None and "HF_TOKEN" in os.environ:
token = os.environ["HF_TOKEN"]
if token is None and "HUGGINGFACE_TOKEN" in os.environ:
token = os.environ["HUGGINGFACE_TOKEN"]
if commit_message is None: commit_message = ""
if "Unsloth" not in commit_message:
commit_message += " (Trained with Unsloth)"
commit_message = commit_message.lstrip()
if commit_description is None:
commit_description = "Upload model trained with Unsloth 2x faster"
elif "Unsloth 2x faster" not in commit_description:
commit_description += " (Trained with Unsloth 2x faster)"
pass
if save_method == "merged_4bit":
raise RuntimeError(
"Unsloth: Merging into 4bit will cause your model to lose accuracy if you plan\n"\
"to merge to GGUF or others later on. I suggest you to do this as a final step\n"\
"if you're planning to do multiple saves.\n"\
"If you are certain, change `save_method` to `merged_4bit_forced`."
)
elif save_method == "merged_4bit_forced":
save_method = "merged_4bit"
pass
save_pretrained_settings = dict(locals())
for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
del save_pretrained_settings[deletion]
pass
# First check for a token!
if push_to_hub:
from huggingface_hub import whoami
try:
username = whoami(token = token)["name"]
except:
raise RuntimeError(
"Unsloth: Please supply a token!\n"\
"Go to https://huggingface.co/settings/tokens"
)
pass
pass
assert(maximum_memory_usage > 0 and maximum_memory_usage <= 0.95)
# Clean memory up first
for _ in range(3):
torch.cuda.empty_cache()
gc.collect()
pass
save_method = save_method.lower().replace(" ", "_")
if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
raise RuntimeError(
"Unsloth: You must select one of 3 options when saving models:\n"\
'"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
'"merged_16bit" ==> This merges LoRA weights and saves to float16. Needed for llama.cpp / GGUF.\n'\
'"merged_4bit" ==> This merges LoRA weights and saves to 4bit. Useful for DPO / inference.'
)
pass
if save_method == "merged_4bit":
print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
print("This might take 5 minutes...")
# Guard against models without LoRA adapters (no merge_and_unload method)
if hasattr(model, "merge_and_unload"):
model = model.merge_and_unload()
pass
print("Done.")
pass
if tags is not None:
assert(isinstance(tags, (list, tuple)))
tags = list(tags) + ["unsloth",]
else:
tags = ["unsloth",]
pass
save_pretrained_settings["tags"] = tags
if ((save_method == "lora") or (save_method == "merged_4bit")) and push_to_hub:
if token is None:
raise RuntimeError(
"Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
"Go to https://huggingface.co/settings/tokens."
)
pass
if save_method == "lora":
print("Unsloth: Saving LoRA adapters. Please wait...")
elif save_method == "merged_4bit":
print("Unsloth: Saving 4bit Bitsandbytes model. Please wait...")
pass
# Update model tag
_ = upload_to_huggingface(
model, save_directory, token,
"finetuned", "trl", file_location = None,
old_username = None, private = private,
)
getattr(model, "original_push_to_hub", tokenizer.push_to_hub)\
(
repo_id = save_directory,
use_temp_dir = use_temp_dir,
commit_message = commit_message,
private = private,
token = token,
max_shard_size = max_shard_size,
create_pr = create_pr,
safe_serialization = safe_serialization,
revision = revision,
commit_description = commit_description,
tags = tags,
)
if tokenizer is not None:
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
getattr(tokenizer, "original_push_to_hub", tokenizer.push_to_hub)\
(
repo_id = save_directory,
use_temp_dir = use_temp_dir,
commit_message = commit_message,
private = private,
token = token,
max_shard_size = max_shard_size,
create_pr = create_pr,
safe_serialization = safe_serialization,
revision = revision,
commit_description = commit_description,
tags = tags,
)
# Revert back padding side
tokenizer.padding_side = old_padding_side
pass
if hasattr(model, "config"):
print(f"Saved {save_method} model to https://huggingface.co/" + save_directory)
pass
return save_directory, None
pass
# Tokenizer has different saving arguments
tokenizer_save_settings = \
{
"save_directory" : save_pretrained_settings["save_directory"],
"legacy_format" : None,
"filename_prefix" : None,
"push_to_hub" : save_pretrained_settings["push_to_hub"],
"private" : save_pretrained_settings["private"],
"token" : save_pretrained_settings["token"],
}
# Check if PEFT Model or not - if yes, 3 levels. If not 2 levels.
from peft import PeftModelForCausalLM
if isinstance(model, PeftModelForCausalLM):
internal_model = model.model
else:
internal_model = model
pass
# Cannot be converted properly!
if (save_method == "merged_4bit") or (save_method == "lora") or (
not hasattr(model, "model") or \
not hasattr(internal_model.model, "layers")
):
# Do general saving
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if save_pretrained_settings["push_to_hub"] is False else \
("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
for deletion in what_to_delete:
del save_pretrained_settings[deletion]
pass
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
# Update model tag
if push_to_hub:
_ = upload_to_huggingface(
model, save_pretrained_settings["save_directory"], token,
"finetuned", "trl", file_location = None,
old_username = None, private = private,
)
pass
if tokenizer is not None:
print("Unsloth: Saving tokenizer...", end = "")
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
tokenizer.save_pretrained(**tokenizer_save_settings)
# Revert back padding side
tokenizer.padding_side = old_padding_side
print(" Done.")
else:
print()
print("Unsloth: Saving model...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
# [TODO] Is this correct?
if save_method == "lora":
save_pretrained_settings["selected_adapters"] = None
pass
model.save_pretrained(**save_pretrained_settings)
if push_to_hub and hasattr(model, "config"):
print("Saved to https://huggingface.co/" + save_pretrained_settings["save_directory"])
pass
print(" Done.")
return save_directory, None
pass
# If push_to_hub, we must strip the "username/" prefix from the repo id
username = None
if push_to_hub and "/" in save_directory:
# +1 solves absolute path issues
username = save_directory[:save_directory.find("/")]
new_save_directory = save_directory[save_directory.find("/")+1:]
logger.warning_once(
f"Unsloth: You are pushing to hub, but you passed your HF username = {username}.\n"\
f"We shall truncate {save_directory} to {new_save_directory}"
)
save_pretrained_settings["save_directory"] = new_save_directory
tokenizer_save_settings ["save_directory"] = new_save_directory
save_directory = new_save_directory
pass
print("Unsloth: Merging 4bit and LoRA weights to 16bit...")
# Determine max RAM usage minus sharding
max_ram = psutil.virtual_memory().available
sharded_ram_usage = 5 * 1024 * 1024 * 1024
if type(max_shard_size) is str:
gb_found = re.match(r"([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
mb_found = re.match(r"([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
if gb_found: sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024
elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024
elif type(max_shard_size) is int:
# max_shard_size is already given in bytes
sharded_ram_usage = max_shard_size
pass
# Switch to our fast saving modules if it's a slow PC!
n_cpus = psutil.cpu_count(logical = False)
if n_cpus is None: n_cpus = psutil.cpu_count()
if n_cpus is None: n_cpus = 1
if safe_serialization is None:
safe_serialization = True
save_pretrained_settings["safe_serialization"] = safe_serialization
elif safe_serialization and (n_cpus <= 2):
logger.warning_once(
f"Unsloth: You have {n_cpus} CPUs. Using `safe_serialization` is 10x slower.\n"\
f"We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.\n"\
f"To force `safe_serialization`, set it to `None` instead.",
)
safe_serialization = False
save_function = fast_save_pickle
save_pretrained_settings["safe_serialization"] = safe_serialization
save_pretrained_settings["save_function"] = save_function
pass
# Only safe_serialization uses more RAM
if safe_serialization:
max_ram -= sharded_ram_usage
else:
max_ram -= sharded_ram_usage*0.25 # Uses much less
pass
max_ram = int(max(0, max_ram) * maximum_memory_usage)
print(f"Unsloth: Will use up to "\
f"{round(max_ram/1024/1024/1024, 2)} out of "\
f"{round(psutil.virtual_memory().total/1024/1024/1024, 2)} RAM for saving.")
# Make directory for disk saving
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
# Check if Kaggle or Colab, since only 20GB of Disk space allowed.
if IS_KAGGLE_ENVIRONMENT or IS_COLAB_ENVIRONMENT:
# We free up 4GB of space
logger.warning_once(
"Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded\n"\
"model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab."
)
_free_cached_model(internal_model)
pass
# HF also uses an OrderedDict
from collections import OrderedDict
state_dict = OrderedDict()
torch_dtype = internal_model.config.torch_dtype
if type(torch_dtype) is str:
if torch_dtype == "float16": torch_dtype = torch.float16
elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16
pass
# Check modules to save float32 dtype
state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype)
max_vram = int(torch.cuda.get_device_properties(0).total_memory * maximum_memory_usage)
from tqdm import tqdm as ProgressBar
for j, layer in enumerate(ProgressBar(internal_model.model.layers)):
for item in LLAMA_WEIGHTS:
proj = eval(f"layer.{item}")
name = f"model.layers.{j}.{item}.weight"
W, bias = _merge_lora(proj, name)
# Bias term
if bias is not None:
state_dict[f"model.layers.{j}.{item}.bias"] = bias
pass
if (torch.cuda.memory_allocated() + W.nbytes) < max_vram:
# Save to GPU memory
state_dict[name] = W
# [TODO] Saving to RAM seems to leak memory???
# elif (max_ram - W.nbytes) > 0:
# # Save to CPU memory
# logger.warning_once(f"We will save to RAM and not VRAM now.")
# state_dict[name] = W.to("cpu", non_blocking = True, copy = True)
# max_ram = max(max_ram - W.nbytes, 0)
else:
# Save to Disk
logger.warning_once(f"We will save to Disk and not RAM now.")
filename = os.path.join(temporary_location, f"{name}.pt")
torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,)
state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True)
pass
for item in LLAMA_LAYERNORMS:
try:
# Skip for Gemma 2
state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data")
except:
continue
pass
pass
state_dict["model.norm.weight"] = internal_model.model.norm.weight.data
# Check for modules_to_save float32 dtype
# Check for tied weights
if internal_model.model.embed_tokens.weight.data_ptr() != internal_model.lm_head.weight.data_ptr():
state_dict["lm_head.weight"] = internal_model.lm_head.weight.data.to(torch_dtype)
pass
# All tensors MUST be type torch.Tensor and not torch.nn.parameter.Parameter
for key, value in state_dict.items():
if hasattr(value, "data"): state_dict[key] = value = value.data
if type(value) is not torch.Tensor:
logger.warning_once(f"Unsloth: {key} is not a Tensor but a {type(value)}.")
pass
pass
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
save_pretrained_settings["state_dict"] = state_dict
# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if not push_to_hub else \
("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
for deletion in what_to_delete:
del save_pretrained_settings[deletion]
pass
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
# Update model tag
if push_to_hub:
_ = upload_to_huggingface(
model, save_pretrained_settings["save_directory"], token,
"finetuned", "trl", file_location = None,
old_username = username, private = private,
)
pass
# First check if we're pushing to an organization!
save_directory = save_pretrained_settings["save_directory"]
if save_pretrained_settings["push_to_hub"]:
new_save_directory, new_username = _determine_username(save_directory, username, token)
if token is not None:
from huggingface_hub import whoami
actual_username = whoami(token = token)["name"]
else:
actual_username = username
pass
# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
# We upload everything at the end!
tokenizer_save_settings["push_to_hub"] = False
tokenizer_save_settings["save_directory"] = new_save_directory
pass
# Save tokenizer
if tokenizer is not None:
print("Unsloth: Saving tokenizer...", end = "")
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
tokenizer.save_pretrained(**tokenizer_save_settings)
# Revert back padding side
tokenizer.padding_side = old_padding_side
print(" Done.")
else:
print()
pass
print("Unsloth: Saving model... This might take 5 minutes for Llama-7b...")
# Since merged, edit quantization_config
old_config = model.config
new_config = model.config.to_dict()
if "quantization_config" in new_config:
del new_config["quantization_config"]
original_model = model
new_config = type(model.config).from_dict(new_config)
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = new_config
model.config = new_config
# Save!
# [TODO] --> is this correct?
# save_pretrained_settings["selected_adapters"] = None
# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
# Pushing to organization!
# Sadly .save_pretrained doesn't work :(
# We first save it via .save_pretrained, then upload manually!
save_pretrained_settings["save_directory"] = new_save_directory
save_pretrained_settings["push_to_hub"] = False
internal_model.save_pretrained(**save_pretrained_settings)
# Now manually go through each file and upload them manually!
filenames = os.listdir(new_save_directory)
from huggingface_hub import HfApi
hf_api = HfApi(token = save_pretrained_settings["token"])
print("Unsloth: Uploading all files... Please wait...")
hf_api.upload_folder(
folder_path = new_save_directory,
path_in_repo = ".",
repo_id = new_save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
ignore_patterns = "*.md",
)
else:
internal_model.save_pretrained(**save_pretrained_settings)
pass
# Revert config back
original_model = model
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = old_config
model.config = old_config
print("Done.")
if push_to_hub and hasattr(model, "config"):
print(f"Saved merged model to https://huggingface.co/{username}/{save_directory.lstrip('/')}")
pass
save_pretrained_settings["state_dict"] = None
for j, (key, value) in enumerate(state_dict.items()):
state_dict[key] = None
if j % 10 == 0:
torch.cuda.empty_cache()
gc.collect()
pass
pass
state_dict = None
del state_dict
torch.cuda.empty_cache()
gc.collect()
# Remove temporary location
import shutil
shutil.rmtree(temporary_location, ignore_errors = True)
for _ in range(3):
torch.cuda.empty_cache()
gc.collect()
return save_directory, username
pass
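# Example usage (illustrative sketch; repo names and the token are placeholders):
#
#   # Merge LoRA into 16bit weights and save locally:
#   unsloth_save_model(model, tokenizer, "finetuned_model", save_method = "merged_16bit")
#
#   # Or push the merged weights straight to the Hub:
#   unsloth_save_model(model, tokenizer, "username/finetuned_model",
#                      save_method = "merged_16bit", push_to_hub = True, token = "hf_...")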
def install_llama_cpp_clone_non_blocking():
full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"]
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_llama_cpp_make_non_blocking():
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# env = { **os.environ, "LLAMA_CUDA": "1", }
# psutil.cpu_count() can return None, so guard against it
n_jobs = max(int((psutil.cpu_count() or 1)*1.5), 1)
# Force make clean
os.system("make clean -C llama.cpp")
full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"]
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_python_non_blocking(packages = []):
full_command = ["pip", "install"] + packages
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
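# These helpers implement a simple overlap pattern: start the git clone / make /
# pip installs as background subprocesses, do other work (e.g. merging weights),
# then .wait() on the handles before their results are needed. A minimal sketch
# of the pattern used further below:
#
#   git_clone = install_llama_cpp_clone_non_blocking()
#   pip_job   = install_python_non_blocking(["gguf", "protobuf"])
#   git_clone.wait()                       # the clone must finish before make
#   make_job  = install_llama_cpp_make_non_blocking()
#   ...do other work...
#   pip_job.wait()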
def install_llama_cpp_old(version = -10):
# Download the 10th latest release since the latest might be broken!
# FALLBACK mechanism
releases = subprocess.check_output(["git", "ls-remote", "--tags", "https://github.com/ggerganov/llama.cpp.git"])
releases = releases.decode("utf-8").replace("\t", " ").split("\n")
for i, x in enumerate(releases):
if "refs/tags/b" not in x: break
releases = releases[:i]
latest = releases[-1]
version = releases[version].split(" ")[0]
# Check if the llama.cpp exists
if os.path.exists("llama.cpp"):
print(
"**[WARNING]** You have a llama.cpp old directory which is broken.\n"\
"Unsloth will DELETE the broken directory and install a new one.\n"\
"Press CTRL + C / cancel this if this is wrong. We shall wait 10 seconds.\n"
)
import time
for i in range(10):
print(f"**[WARNING]** Deleting llama.cpp directory... {10-i} seconds left.")
time.sleep(1)
import shutil
shutil.rmtree("llama.cpp", ignore_errors = True)
pass
# Clone a specific commit
# Also don't use the GPU!
commands = [
"git clone --recursive https://github.com/ggerganov/llama.cpp",
f"cd llama.cpp && git reset --hard {version} && git clean -df",
"make clean -C llama.cpp",
f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
]
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
pass
pass
# Check if successful
if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"):
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or `llama.cpp/quantize` does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
pass
def install_llama_cpp_blocking(use_cuda = False):
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# use_cuda = "LLAMA_CUDA=1" if use_cuda else ""
commands = [
"git clone --recursive https://github.com/ggerganov/llama.cpp",
"make clean -C llama.cpp",
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp",
f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
"pip install gguf protobuf",
]
if os.path.exists("llama.cpp"): return
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
pass
pass
pass
def get_executable(executables):
# Get system PATH directories, split on the OS path separator
system_directories = os.environ.get("PATH").split(os.pathsep)
for directory in system_directories:
for executable in executables:
path = os.path.join(directory, executable)
# Check if the executable exists and is executable
if os.path.exists(path) and os.access(path, os.X_OK): return path
pass
pass
return None
pass
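# Example (illustrative): look for a system-wide llama.cpp quantizer before
# falling back to a local build. Returns the first match on PATH, else None.
#
#   quantize_location = get_executable(["llama-quantize", "quantize"])
#   if quantize_location is None:
#       print("llama.cpp not found on PATH")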
def save_to_gguf(
model_type : str,
model_dtype : str,
is_sentencepiece : bool = False,
model_directory : str = "unsloth_finetuned_model",
quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"]
first_conversion : str = None,
_run_installer = None, # Non blocking install of llama.cpp
):
# logger.warning(
# "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\
# "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\
# "Please be patient - GGUF saving should still work, but might not work as well."
# )
assert(model_dtype == "float16" or model_dtype == "bfloat16")
model_dtype = "f16" if model_dtype == "float16" else "bf16"
# Convert quantization_method to list
if isinstance(quantization_method, list): pass
elif isinstance(quantization_method, str): quantization_method = [ quantization_method, ]
elif isinstance(quantization_method, tuple): quantization_method = list(quantization_method)
else:
raise TypeError("Unsloth: quantization_method can only be a string or a list of strings")
pass
# Check if bfloat16 is supported
if model_dtype == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
"Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
"We shall switch instead to f16."
)
model_dtype = "f16"
pass
# Check first_conversion as well
if first_conversion is None:
first_conversion = model_dtype
pass
# Check I quants
for quant_method in quantization_method:
if quant_method.startswith("iq2"):
raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
pass
# Careful convert.py is only for Llama / Mistral based archs
use_fast_convert = False
if not is_sentencepiece: use_fast_convert = False # Llama-3
elif model_type == "llama": use_fast_convert = True
elif model_type == "mistral": use_fast_convert = True
pass
logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.")
# Map quant methods
new_quantization_method = []
for quant_method in quantization_method:
if quant_method == "not_quantized": quant_method = model_dtype
elif quant_method == "fast_quantized": quant_method = "q8_0"
elif quant_method == "quantized": quant_method = "q4_k_m"
elif quant_method is None: quant_method = "q8_0"
# Check if wrong method
if quant_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
pass
new_quantization_method.append(quant_method)
pass
quantization_method = new_quantization_method
print_info = \
f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\
f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits will take 3 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\
f' "-____-" In total, you will have to wait at least 16 minutes.\n'
print(print_info)
# Check first_conversion format
if first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "f32" : pass
elif first_conversion == "q8_0" : pass
else:
raise RuntimeError(
f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`."
)
pass
# Determine whether the system already has llama.cpp installed and the scripts are executable
quantize_location = get_executable(["llama-quantize", "quantize"])
convert_location = get_executable(["convert-hf-to-gguf.py", "convert_hf_to_gguf.py"])
if quantize_location is not None and convert_location is not None:
print("Unsloth: llama.cpp found in the system. We shall skip installation.")
else:
print("Unsloth: [0] Installing llama.cpp. This will take 3 minutes...")
if _run_installer is not None:
error = _run_installer.wait()
else:
error = 0
install_llama_cpp_blocking()
pass
# Check if successful. If not install 10th latest release
# Careful llama.cpp/quantize changed to llama.cpp/llama-quantize
# and llama.cpp/main changed to llama.cpp/llama-cli
# See https://github.com/ggerganov/llama.cpp/pull/7809
quantize_location = None
if os.path.exists("llama.cpp/quantize"):
quantize_location = "llama.cpp/quantize"
elif os.path.exists("llama.cpp/llama-quantize"):
quantize_location = "llama.cpp/llama-quantize"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
# See https://github.com/unslothai/unsloth/pull/730
# Filenames changed again!
convert_location = None
if os.path.exists("llama.cpp/convert-hf-to-gguf.py"):
convert_location = "llama.cpp/convert-hf-to-gguf.py"
elif os.path.exists("llama.cpp/convert_hf_to_gguf.py"):
convert_location = "llama.cpp/convert_hf_to_gguf.py"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/convert-hf-to-gguf.py' or 'llama.cpp/convert_hf_to_gguf.py' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
if error != 0 or quantize_location is None or convert_location is None:
print(f"Unsloth: llama.cpp error code = {error}.")
install_llama_cpp_old(-10)
pass
pass
# Determine maximum first_conversion state
if first_conversion == "f32" : strength = 3
elif first_conversion == "f16" : strength = 2
elif first_conversion == "bf16" : strength = 1
elif first_conversion == "q8_0" : strength = 0
for quant_method in quantization_method:
if quant_method == "f32": strength = max(strength, 3)
elif quant_method == "f16": strength = max(strength, 2)
elif quant_method == "bf16": strength = max(strength, 1)
elif quant_method == "q8_0": strength = max(strength, 0)
else:
# Quantized models must have f16 as the default argument
if first_conversion == "f32" : pass
elif first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "q8_0":
logger.warning_once(
"Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\
"but saves disk space!"
)
# first_conversion = "f16"
pass
pass
pass
# If only q8_0:
if len(quantization_method) == 1 and quantization_method[0] == "q8_0":
strength = 0
pass
if strength >= 3: first_conversion = "f32"
elif strength >= 2: first_conversion = "f16"
elif strength >= 1: first_conversion = "bf16"
else: first_conversion = "q8_0"
# Non Llama / Mistral archs can only use f32 or f16
if not use_fast_convert and \
(first_conversion != "f16" and first_conversion != "bf16" and first_conversion != "f32"):
pass
# Latest llama.cpp works for all models for q8_0!
# logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.")
# first_conversion = "f16"
pass
# Check if bfloat16 is supported
if first_conversion == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
"Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
"We shall switch instead to f16."
)
first_conversion = "f16"
pass
n_cpus = psutil.cpu_count()
if n_cpus is None: n_cpus = 1
n_cpus *= 2
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
final_location = f"./{model_directory}/unsloth.{first_conversion.upper()}.gguf"
print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\
f"The output location will be {final_location}\n"\
"This will take 3 minutes...")
# We first check if tokenizer.model exists in the model_directory
if os.path.exists(f"{model_directory}/tokenizer.model"):
vocab_type = "spm,hfft,bpe"
# Fix Sentencepiece model as well!
fix_sentencepiece_gguf(model_directory)
else:
vocab_type = "bpe"
pass
# convert.py is deprecated!
use_fast_convert = False
if use_fast_convert:
command = f"python llama.cpp/convert.py {model_directory} "\
f"--outfile {final_location} --vocab-type {vocab_type} "\
f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab"
else:
command = f"python {convert_location} {model_directory} "\
f"--outfile {final_location} "\
f"--outtype {first_conversion}"
pass
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")
full_precision_location = final_location
all_saved_locations = [full_precision_location,]
# Convert each type!
for quant_method in quantization_method:
if quant_method != first_conversion:
print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...")
final_location = f"./{model_directory}/unsloth.{quant_method.upper()}.gguf"
command = f"./{quantize_location} {full_precision_location} "\
f"{final_location} {quant_method} {n_cpus}"
# quantize uses stderr
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
"Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")
all_saved_locations.append(final_location)
pass
pass
return all_saved_locations
pass
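# Example usage (illustrative sketch; the directory must already contain a merged
# 16bit HF checkpoint, e.g. one produced by unsloth_save_model):
#
#   locations = save_to_gguf(
#       model_type          = "llama",
#       model_dtype         = "float16",
#       is_sentencepiece    = True,
#       model_directory     = "unsloth_finetuned_model",
#       quantization_method = ["q4_k_m", "q8_0"],
#   )
#   # locations[0] is the full precision GGUF; the rest are the quantized files.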
def unsloth_save_pretrained_merged(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
"""
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose `save_method` to be one of:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.save_pretrained(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
del arguments["self"]
unsloth_save_model(**arguments)
for _ in range(3):
gc.collect()
pass
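# Example usage (illustrative sketch): `patch_saving_functions` (exported in
# __all__ above) is expected to bind this as a method on the model, so the
# typical call looks like:
#
#   model.save_pretrained_merged("my_model", tokenizer, save_method = "merged_16bit")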
def unsloth_push_to_hub_merged(
self,
repo_id : str,
tokenizer = None,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
token : Union[bool, str, None] = None,
max_shard_size : Union[int, str, None] = "5GB",
create_pr : bool = False,
safe_serialization : bool = True,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : Optional[List[str]] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
"""
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose `save_method` to be one of:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.push_to_hub(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = True
del arguments["self"]
del arguments["repo_id"]
unsloth_save_model(**arguments)
for _ in range(3):
gc.collect()
pass
MODEL_CARD = \
"""---
base_model: {base_model}
tags:
- text-generation-inference
- transformers
- unsloth
- {model_type}
- {extra}
license: apache-2.0
language:
- en
---
# Uploaded {method} model
- **Developed by:** {username}
- **License:** apache-2.0
- **Finetuned from model:** {base_model}
This {model_type} model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
"""
def _determine_username(save_directory, old_username, token):
username = ""
save_directory = save_directory.lstrip("./")
if "/" not in save_directory:
from huggingface_hub import whoami
try:
username = whoami(token = token)["name"]
if type(old_username) is str and username != old_username:
username = old_username
pass
save_directory = f"{username}/{save_directory}"
except:
raise RuntimeError(f"Unsloth: {save_directory} is not a Huggingface directory.")
else:
username = save_directory.split("/")[0]
pass
return save_directory, username
pass
def upload_to_huggingface(
model,
save_directory,
token,
method,
extra = "",
file_location = None,
old_username = None,
private = None,
):
save_directory, username = _determine_username(save_directory, old_username, token)
from huggingface_hub import create_repo
try:
create_repo(
repo_id = save_directory,
token = token,
repo_type = "model",
exist_ok = False,
private = private,
)
# Create model card
from huggingface_hub import ModelCard
content = MODEL_CARD.format(
username = username,
base_model = model.config._name_or_path,
model_type = model.config.model_type,
method = "",
extra = extra,
)
card = ModelCard(content)
card.push_to_hub(save_directory, token = token)
except:
pass
if file_location is not None:
# Now upload file
from huggingface_hub import HfApi
hf_api = HfApi(token = token)
if "/" in file_location:
uploaded_location = file_location[file_location.rfind("/")+1:]
else:
uploaded_location = file_location
pass
# find ftevent file from tensorboard and upload it
import glob
ftevent_files = glob.glob("*out.tfevents*", recursive = True)
if len(ftevent_files) > 0:
print("Unsloth: Uploading tensorboard files... Please wait...", file_location + "*out.tfevents*")
for ftevent_file in ftevent_files:
hf_api.upload_file(
path_or_fileobj = ftevent_file,
path_in_repo = ftevent_file.replace(file_location, ""),
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
hf_api.upload_file(
path_or_fileobj = file_location,
path_in_repo = uploaded_location,
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
# We also upload a config.json file
import json
with open("_temporary_unsloth_config.json", "w") as file:
json.dump({"model_type" : model.config.model_type}, file, indent = 4)
pass
hf_api.upload_file(
path_or_fileobj = "_temporary_unsloth_config.json",
path_in_repo = "config.json",
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
os.remove("_temporary_unsloth_config.json")
pass
return username
pass
def fix_tokenizer_bos_token(tokenizer):
# Check if BOS added already, then warn
fix_bos_token = False
chat_template = getattr(tokenizer, "chat_template", None)
if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)):
if chat_template is not None and \
(
tokenizer.bos_token in chat_template or \
"{bos_token}" in chat_template.replace(" ", "") or \
"{bos_token+" in chat_template.replace(" ", "")
):
fix_bos_token = True
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily."
)
# Remove {{bos_token}}
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template)
# Remove {{bos_token +
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template)
tokenizer.chat_template = new_chat_template
pass
pass
return fix_bos_token, chat_template
pass
def create_ollama_modelfile(tokenizer, gguf_location):
"""
Creates an Ollama Modelfile.
Use ollama.create(model = "new_ollama_model", modelfile = modelfile)
"""
modelfile = getattr(tokenizer, "_ollama_modelfile", None)
if modelfile is None: return None
modelfile = modelfile\
.replace("{{", "⚫@✅#🦥")\
.replace("}}", "⚡@🦥#⛵")\
.format(
__FILE_LOCATION__ = gguf_location,
)\
.replace("⚫@✅#🦥", "{{")\
.replace("⚡@🦥#⛵", "}}")\
.rstrip()
pass
return modelfile
pass
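# Example usage (illustrative; follows the docstring above and assumes the
# `ollama` Python package plus a tokenizer carrying an `_ollama_modelfile` template):
#
#   modelfile = create_ollama_modelfile(tokenizer, "unsloth.Q8_0.gguf")
#   if modelfile is not None:
#       import ollama
#       ollama.create(model = "new_ollama_model", modelfile = modelfile)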
def unsloth_save_pretrained_gguf(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
quantization_method : str = "fast_quantized",
first_conversion : str = None,
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
private : Optional[bool] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.85,
):
"""
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 then converted to GGUF / llama.cpp format.
Choose `quantization_method` to be one of:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q4_k" : "alias for q4_k_m",
"q5_k" : "alias for q5_k_m",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
"iq2_xxs" : "2.06 bpw quantization",
"iq2_xs" : "2.31 bpw quantization",
"iq3_xxs" : "3.06 bpw quantization",
"q3_k_xs" : "3-bit extra small quantization",
"""
if tokenizer is None:
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["quantization_method"]
del arguments["first_conversion"]
# Fix tokenizer adding an extra BOS token at the front
fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
# Non blocking install GGUF first
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
else:
try:
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
except:
# Retry by recloning llama.cpp
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
pass
pass
# Use old chat template if the bos is removed
if fix_bos_token:
tokenizer.chat_template = old_chat_template
pass
for _ in range(3):
gc.collect()
model_dtype = self.config.torch_dtype
model_type = self.config.model_type
if type(model_dtype) is str:
assert(model_dtype == "float16" or model_dtype == "bfloat16")
elif model_dtype == torch.float16:
model_dtype = "float16"
elif model_dtype == torch.bfloat16:
model_dtype = "bfloat16"
else:
raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
pass
is_sentencepiece_model = check_if_sentencepiece_model(self)
# Save to GGUF
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)
# Save Ollama modelfile
modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
modelfile_location = None
if modelfile is not None:
modelfile_location = os.path.join(new_save_directory, "Modelfile")
with open(modelfile_location, "w") as file:
file.write(modelfile)
pass
print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
pass
if fix_bos_token:
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### We removed it in GGUF's chat template for you."
)
pass
if push_to_hub:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
for file_location in all_file_locations:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
# Save modelfile
if modelfile_location is not None:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", modelfile_location, old_username, private,
)
print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
pass
pass
pass
def unsloth_push_to_hub_gguf(
self,
repo_id : str,
tokenizer = None,
quantization_method : str = "fast_quantized",
first_conversion : str = None,
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
token : Union[bool, str, None] = None,
max_shard_size : Union[int, str, None] = "5GB",
create_pr : bool = False,
safe_serialization : bool = True,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : Optional[List[str]] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.85,
):
"""
Same as .push_to_hub(...) except 4-bit weights are automatically
converted to float16, then converted to GGUF / llama.cpp format.
Choose `quantization_method` to be one of:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
"""
if tokenizer is None:
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["repo_id"]
del arguments["quantization_method"]
del arguments["first_conversion"]
# Fix tokenizer adding an extra BOS token at the front
fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
# Non-blocking install of GGUF dependencies first
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
else:
try:
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
except:
# Retry by recloning llama.cpp
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
pass
pass
# Restore the old chat template if the BOS token was removed
if fix_bos_token:
tokenizer.chat_template = old_chat_template
pass
for _ in range(3):
gc.collect()
model_dtype = self.config.torch_dtype
model_type = self.config.model_type
if type(model_dtype) is str:
assert(model_dtype == "float16" or model_dtype == "bfloat16")
elif model_dtype == torch.float16:
model_dtype = "float16"
elif model_dtype == torch.bfloat16:
model_dtype = "bfloat16"
else:
raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
pass
is_sentencepiece_model = check_if_sentencepiece_model(self)
# Save to GGUF
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)
# Save Ollama modelfile
modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
modelfile_location = None
if modelfile is not None:
modelfile_location = os.path.join(new_save_directory, "Modelfile")
with open(modelfile_location, "w") as file:
file.write(modelfile)
pass
print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
pass
print("Unsloth: Uploading GGUF to Huggingface Hub...")
for file_location in all_file_locations:
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
# Save modelfile
if modelfile_location is not None:
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", modelfile_location, old_username, private,
)
print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
pass
if fix_bos_token:
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### We removed it in GGUF's chat template for you."
)
pass
pass
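# Illustrative usage sketch for the method above. It assumes a fine-tuned `model`
# and `tokenizer` are already loaded via FastLanguageModel; the repo id and token
# below are placeholders you would replace:
#
#   model.push_to_hub_gguf(
#       "your-username/your-model-gguf",   # hypothetical repo id
#       tokenizer,
#       quantization_method = "q4_k_m",
#       token = "hf_...",                  # hypothetical HF token
#   )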
# Helper function to save LoRA adapters to a custom directory
def save_lora_to_custom_dir(model, tokenizer, save_directory):
# Create the custom directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)
# Call the unsloth_save_model function with the custom directory
unsloth_save_model(
model,
tokenizer,
save_directory=save_directory,
save_method="lora",
push_to_hub=False,
)
# Method on the model class to convert LoRA adapters to GGML and push them to the Hugging Face Hub
def unsloth_convert_lora_to_ggml_and_push_to_hub(
self,
tokenizer,
repo_id: str,
use_temp_dir: Optional[bool] = None,
commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth",
private: Optional[bool] = None,
token: Union[bool, str, None] = None,
create_pr: bool = False,
revision: str = None,
commit_description: str = "Convert LoRA to GGML format using Unsloth",
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
python_install = install_python_non_blocking(["protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda=False)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
python_install.wait()
else:
makefile = None
for _ in range(3):
gc.collect()
lora_directory_push = "lora-to-ggml-push"
save_lora_to_custom_dir(self, tokenizer, lora_directory_push)
model_type = self.config.model_type
output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin")
print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.")
print(f"The output file will be {output_file}")
command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama"
try:
with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
for line in sp.stdout:
print(line, end="", flush=True)
for line in sp.stderr:
print(line, end="", flush=True)
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, command)
except subprocess.CalledProcessError as e:
print(f"Error: Conversion failed with return code {e.returncode}")
return
print(f"Unsloth: Conversion completed! Output file: {output_file}")
print("Unsloth: Uploading GGML file to Hugging Face Hub...")
username = upload_to_huggingface(
self, repo_id, token,
"GGML converted LoRA", "ggml", output_file, None, private,
)
link = f"{repo_id.lstrip('/')}"
print("Unsloth: Done.")
print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}")
print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
def unsloth_convert_lora_to_ggml_and_save_locally(
self,
save_directory: str, # Added parameter for the folder name
tokenizer,
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
python_install = install_python_non_blocking(["protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda=False)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
python_install.wait()
else:
makefile = None
for _ in range(3):
gc.collect()
# Use the provided save_directory for local saving
save_lora_to_custom_dir(self, tokenizer, save_directory)
model_type = self.config.model_type
output_file = os.path.join(save_directory, "ggml-adapter-model.bin")
print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.")
print(f"The output file will be {output_file}")
command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama"
try:
with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
for line in sp.stdout:
print(line, end="", flush=True)
for line in sp.stderr:
print(line, end="", flush=True)
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, command)
except subprocess.CalledProcessError as e:
print(f"Error: Conversion failed with return code {e.returncode}")
return
print("Unsloth: Done.")
print(f"Unsloth: Conversion completed! Output file: {output_file}")
print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
def patch_saving_functions(model):
import inspect
import types
from typing import Callable, Optional, Union, List
# And now re-add our saving methods!
if model.push_to_hub.__name__ == "unsloth_push_to_hub":
original_push_to_hub = model.original_push_to_hub
else:
original_push_to_hub = model.push_to_hub
pass
signature = str(inspect.signature(original_push_to_hub)).replace("NoneType", "None")
signature = signature[1:]
signature = re.sub("<function save at .+?>", "torch.save", signature)
docs = original_push_to_hub.__doc__.encode("utf-8").decode("utf-8")
push_to_hub_text = f'''def unsloth_push_to_hub(self, {signature}:
"""
{docs}
"""
arguments = dict(locals())
del arguments["self"]
if "tags" in arguments and arguments["tags"] is not None:
assert(isinstance(arguments["tags"], (list, tuple)))
arguments["tags"] = list(arguments["tags"]) + ["unsloth",]
elif "tags" in arguments:
arguments["tags"] = ["unsloth",]
elif hasattr(self, "add_model_tags"):
self.add_model_tags(["unsloth",])
if "commit_message" in arguments:
commit_message = arguments["commit_message"]
if commit_message is not None:
if not commit_message.endswith(" "): commit_message += " "
if "Unsloth" not in commit_message:
commit_message += "(Trained with Unsloth)"
else:
commit_message = "Upload model trained with Unsloth"
arguments["commit_message"] = commit_message
if "commit_description" in arguments:
commit_description = arguments["commit_description"]
if commit_description is not None:
if not commit_description.endswith(" "): commit_description += " "
if "Unsloth" not in commit_description:
commit_description += "(Trained with Unsloth 2x faster)"
else:
commit_description = "Upload model trained with Unsloth 2x faster"
arguments["commit_description"] = commit_description
# Update model tag
if hasattr(self, "config"):
_ = upload_to_huggingface(
self, arguments["repo_id"], arguments["token"],
"finetuned", "trl", file_location = None,
old_username = None, private = arguments["private"],
)
pass
try:
self.original_push_to_hub(**arguments)
except:
del arguments["tags"]
self.original_push_to_hub(**arguments)
pass
if hasattr(self, "config"):
print("Saved model to https://huggingface.co/" + arguments["repo_id"])
pass
'''
exec(push_to_hub_text, globals())
original_model = model
while True:
if original_model.push_to_hub.__name__ != "unsloth_push_to_hub":
original_model.original_push_to_hub = original_model.push_to_hub
original_model.push_to_hub = types.MethodType(unsloth_push_to_hub, original_model)
if hasattr(original_model, "add_model_tags"):
original_model.add_model_tags(["unsloth",])
pass
pass
if hasattr(original_model, "model"): original_model = original_model.model
else: break
pass
# Add saving methods to top level model
if hasattr(model, "config"):
# Counteract tokenizers
model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model)
model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model)
model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model)
model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model)
model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model)
model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model)
pass
return model
pass
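# Illustrative usage sketch: patch_saving_functions is typically invoked by
# Unsloth's model loaders, but conceptually it does the following (the `model`
# here is a placeholder for any loaded causal LM):
#
#   model = patch_saving_functions(model)
#   # model now exposes push_to_hub_merged, save_pretrained_merged,
#   # push_to_hub_gguf, save_pretrained_gguf, push_to_hub_ggml,
#   # save_pretrained_ggml, plus a tagged push_to_hub wrapper.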
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import AutoTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from transformers import PreTrainedTokenizerFast
import re
import os
from transformers.models.llama.modeling_llama import logger
from peft import PeftModelForCausalLM
import torch
import itertools
import collections
import numpy as np
import gc
import subprocess
__all__ = [
"load_correct_tokenizer",
"fix_sentencepiece_tokenizer",
"check_tokenizer",
"add_new_tokens",
"fix_sentencepiece_gguf",
]
IGNORED_TOKENIZER_CHECKING = frozenset((
"CodeLlamaTokenizerFast",
"CodeLlamaTokenizer",
))
IGNORED_TOKENIZER_NAMES = [
# "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
# "unsloth/Mistral-Nemo-Instruct-2407",
# "mistralai/Mistral-Nemo-Instruct-2407",
# "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
# "unsloth/Mistral-Nemo-Base-2407",
# "mistralai/Mistral-Nemo-Base-2407",
]
IGNORED_TOKENIZER_NAMES = frozenset(
[x.lower() for x in IGNORED_TOKENIZER_NAMES]
)
# Check environments
keynames = "\n" + "\n".join(os.environ.keys())
IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
del keynames
def try_fix_tokenizer(tokenizer, prepend = True):
if hasattr(tokenizer, "_tokenizer"):
converted_tokenizer = tokenizer._tokenizer
else:
converted_tokenizer = convert_slow_tokenizer(tokenizer)
pass
tokenizer_string = converted_tokenizer.to_str()
# Llama prepends "▁" (so "apple" becomes "▁apple"). Sometimes this is wrong!
prepend_text = '{"type":"Prepend","prepend":"▁"},'
if not prepend and prepend_text in tokenizer_string:
tokenizer_string = tokenizer_string.replace(prepend_text, "", 1)
pass
dir_names = dir(tokenizer)
# Get eos_token, bos_token etc
token_names = [x for x in dir_names if x.endswith("_token") and x.count("_") == 1]
for token_name in token_names:
token = getattr(tokenizer, token_name, None)
if token is None: continue
token_id = getattr(tokenizer, token_name + "_id", None)
# Locate the token's id mapping in the string
find_text = f'"id":{token_id},"content":"'
start = tokenizer_string.find(find_text) + len(find_text)
if start == -1: continue
end = tokenizer_string.find('",', start)
bad_token = tokenizer_string[start : end]
# Check if token is the actual same one - if not, edit it
if bad_token != token:
bad_text = f'{find_text}{bad_token}",'
good_text = f'{find_text}{token}",'
tokenizer_string = tokenizer_string.replace(bad_text, good_text, 1)
# And replace vocab section
bad_text = f'"{bad_token}":{token_id},'
good_text = f'"{token}":{token_id},'
tokenizer_string = tokenizer_string.replace(bad_text, good_text, 1)
pass
pass
fixed_tokenizer = converted_tokenizer.from_str(tokenizer_string)
return fixed_tokenizer
pass
def get_sorted_dict(dictionary):
sorted_keys = sorted(dictionary.values())
inverted_dictionary = { value : key for key, value in dictionary.items() }
sorted_dictionary = {}
for key in sorted_keys:
value = inverted_dictionary[key]
sorted_dictionary[value] = key
return sorted_dictionary
pass
def convert_to_fast_tokenizer(
slow_tokenizer,
temporary_location = "_unsloth_sentencepiece_temp",
):
is_fast = getattr(slow_tokenizer, "is_fast", False)
if is_fast: return slow_tokenizer
try:
tokenizer_name = slow_tokenizer.__class__.__name__
lowered_tokenizer_name = tokenizer_name.lower()
if lowered_tokenizer_name.endswith("tokenizer"):
class_name = lowered_tokenizer_name[:-len("tokenizer")]
FastTokenizer = eval(
f'__import__(f"transformers.models.{class_name}").{tokenizer_name}Fast'
)
else:
FastTokenizer = PreTrainedTokenizerFast
except:
FastTokenizer = PreTrainedTokenizerFast
pass
# Get all arguments (bos_token, etc)
docs = FastTokenizer.__doc__
docs = docs[docs.find("Args:"):]
args = re.findall(r"\n[\s]+([^\s]{1,}) \(", docs, flags = re.MULTILINE)
args = [x for x in args if not x.endswith("_file")]
# Also some missing maybe!
docs = PreTrainedTokenizerFast.__doc__
docs = docs[docs.find("Args:"):]
args2 = re.findall(r"\n[\s]+([^\s]{1,}) \(", docs, flags = re.MULTILINE)
args2 = [x for x in args2 if not x.endswith("_file")]
args = list(set(args + args2))
kwargs = {}
for arg in args: kwargs[arg] = getattr(slow_tokenizer, arg, None)
kwargs["tokenizer_object"] = try_fix_tokenizer(slow_tokenizer, prepend = True)
fast_tokenizer = FastTokenizer( **kwargs )
# Check if they're similar!
sorted_slow_tokenizer = get_sorted_dict(slow_tokenizer.get_vocab())
sorted_fast_tokenizer = get_sorted_dict(fast_tokenizer.get_vocab())
check_vocab = (sorted_slow_tokenizer == sorted_fast_tokenizer)
check_special = (slow_tokenizer.all_special_tokens == fast_tokenizer.all_special_tokens)
# Failure so return slow_tokenizer
if not check_vocab or not check_special: return slow_tokenizer
# Now confirm if they match
if not assert_same_tokenization(slow_tokenizer, fast_tokenizer):
# Maybe remove the "▁" prepending (i.e. "▁apple" -> "apple")?
kwargs["tokenizer_object"] = try_fix_tokenizer(slow_tokenizer, prepend = False)
fast_tokenizer = FastTokenizer( **kwargs )
if not assert_same_tokenization(slow_tokenizer, fast_tokenizer):
# Failure :(
return slow_tokenizer
pass
pass
# Also tokenizer.model is missing!
name = slow_tokenizer.name_or_path.replace("/", "_")
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
new_location = f"{temporary_location}/{name}"
slow_tokenizer.save_pretrained(new_location)
fast_tokenizer.save_pretrained(new_location)
# Now load it!
fast_tokenizer = AutoTokenizer.from_pretrained(new_location)
if assert_same_tokenization(slow_tokenizer, fast_tokenizer):
return fast_tokenizer
return slow_tokenizer
pass
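# Illustrative usage sketch, assuming `slow_tokenizer` is a slow (sentencepiece
# based) tokenizer you have already loaded; the function falls back to the slow
# tokenizer whenever the fast conversion does not round-trip identically:
#
#   fast_tokenizer = convert_to_fast_tokenizer(slow_tokenizer)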
# Check Mistral chat template without BOS / EOS
mistral_template = \
"{% if messages[0]['role'] == 'system' %}"\
"{% if messages[1]['role'] == 'user' %}"\
"{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[2:] %}"\
"{% else %}"\
"{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% endif %}"\
"{% else %}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '[INST] ' + message['content'] + ' [/INST]' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ message['content'] }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"
pass
# Check Llama chat template without BOS / EOS
llama_template = \
"{% if messages[0]['role'] == 'system' %}"\
"{% if messages[1]['role'] == 'user' %}"\
"{{ '[INST] <<SYS>>\n' + messages[0]['content'] + '\n<</SYS>>\n\n' + messages[1]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[2:] %}"\
"{% else %}"\
"{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% endif %}"\
"{% else %}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '[INST] ' + message['content'].strip() + ' [/INST]' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ ' ' + message['content'].strip() + ' ' }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"
pass
def assert_same_tokenization(slow_tokenizer, fast_tokenizer):
# Get eos_token, bos_token etc
dir_names = dir(slow_tokenizer)
special_tokens = list(filter(None, (
getattr(slow_tokenizer, x) for x in dir_names
if x.endswith("_token") and x.count("_") == 1
)))
all_special_tokens = list(set(special_tokens + slow_tokenizer.all_special_tokens))
# Check if chat template is enabled!
check_chat_template1 = True
check_chat_template2 = True
check_chat_template3 = True
"""
Weirdly, Mistral tokenizers are actually correct.
I.e. the checks below would load Mistral v1 and v3 incorrectly, so they are disabled.
slow_chat_template = getattr(slow_tokenizer, "chat_template", None)
fast_chat_template = getattr(fast_tokenizer, "chat_template", None)
messages = [
{"role": "user", "content": " What is 2+2? "},
{"role": "assistant", "content": " It's 4. "},
]
# Check the tokenizer's own chat template
if slow_chat_template is not None and fast_chat_template is not None:
check_chat_template1 = \
slow_tokenizer.apply_chat_template(messages) == \
fast_tokenizer.apply_chat_template(messages)
pass
# Check Mistral chat template without BOS / EOS
slow_tokenizer.chat_template = mistral_template
fast_tokenizer.chat_template = mistral_template
check_chat_template2 = \
slow_tokenizer.apply_chat_template(messages) == \
fast_tokenizer.apply_chat_template(messages)
pass
# Check Llama chat template without BOS / EOS
slow_tokenizer.chat_template = llama_template
fast_tokenizer.chat_template = llama_template
check_chat_template3 = \
slow_tokenizer.apply_chat_template(messages) == \
fast_tokenizer.apply_chat_template(messages)
pass
# Combine them all and revert chat templates
slow_tokenizer.chat_template = slow_chat_template
fast_tokenizer.chat_template = fast_chat_template
"""
check_chat_template = check_chat_template1 and check_chat_template2 and check_chat_template3
# Try special tokens
try:
string = "\n".join(all_special_tokens) + \
"A quick brown fox jumps over the lazy dog!!\n\nHi</s>\n\n" + \
"".join(all_special_tokens)
check_special_tokens = \
slow_tokenizer(string).input_ids == \
fast_tokenizer(string).input_ids
return check_chat_template and check_special_tokens
except:
# For eg see https://github.com/unslothai/unsloth/issues/292
# Sometimes tokenizer has weird tokens, causing a combined tokenization to fail.
# [TODO] We temporarily disable this for CodeLlama tokenizers
if slow_tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING:
return check_chat_template
else:
return False
pass
pass
def fix_sentencepiece_tokenizer(
old_tokenizer,
new_tokenizer,
token_mapping,
temporary_location = "_unsloth_sentencepiece_temp",
):
# From https://github.com/google/sentencepiece/issues/121
# We need to manually edit the sentencepiece tokenizer!
from transformers.utils import sentencepiece_model_pb2
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
# Check if tokenizer.model exists
if not os.path.isfile(f"{temporary_location}/tokenizer.model"):
return new_tokenizer
pass
# First save the old tokenizer
old_tokenizer.save_pretrained(temporary_location)
tokenizer_file = sentencepiece_model_pb2.ModelProto()
tokenizer_file.ParseFromString(open(f"{temporary_location}/tokenizer.model", "rb").read())
# Now save the new tokenizer
new_tokenizer.save_pretrained(temporary_location)
# Now correct the old tokenizer's .model file
for old_token, new_token in token_mapping.items():
ids = old_tokenizer([old_token], add_special_tokens = False).input_ids
ids = ids[0]
if (len(ids) != 1):
# Skip this token!
print(f"Skip mapping {old_token} to {new_token} since {new_token} is already in the tokenizer!")
continue
pass
ids = ids[0]
# [TODO] Hack for Starling - try except
try:
tokenizer_piece = tokenizer_file.pieces[ids]
except:
continue
assert(tokenizer_piece.piece == old_token)
tokenizer_piece.piece = new_token
pass
# And now write it
with open(f"{temporary_location}/tokenizer.model", "wb") as file:
file.write(tokenizer_file.SerializeToString())
pass
# And load it!
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
temporary_location,
eos_token = new_tokenizer.eos_token,
pad_token = new_tokenizer.pad_token,
)
return tokenizer
pass
def fix_sentencepiece_gguf(saved_location):
"""
Fixes sentencepiece tokenizers which did not extend the vocabulary with
user defined tokens.
Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py
"""
from copy import deepcopy
from transformers.utils import sentencepiece_model_pb2
import json
from enum import IntEnum
class SentencePieceTokenTypes(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
pass
# Load tokenizer.model
tokenizer_file = sentencepiece_model_pb2.ModelProto()
if not os.path.isfile(f"{saved_location}/tokenizer.model"): return
tokenizer_file.ParseFromString(open(f"{saved_location}/tokenizer.model", "rb").read())
sentence_piece_size = len(tokenizer_file.pieces)
# Load added_tokens_json
if not os.path.isfile(f"{saved_location}/added_tokens.json"): return
with open(f"{saved_location}/added_tokens.json", "r", encoding = "utf-8") as file:
added_tokens_json = json.load(file)
pass
if len(added_tokens_json) == 0: return
added_tokens_json = dict(sorted(added_tokens_json.items(), key = lambda item: item[1]))
new_size = sentence_piece_size + len(added_tokens_json)
# Confirm added_tokens_json is correct
added_tokens_ids = np.array(list(added_tokens_json.values()))
diff = np.diff(added_tokens_ids)
if (diff.min() != 1 or diff.max() != 1): return
if (added_tokens_ids.min() != sentence_piece_size): return
# Edit sentence piece tokens with added_tokens_json
logger.warning(
f"Unsloth: Extending {saved_location}/tokenizer.model with added_tokens.json.\n"\
f"Originally tokenizer.model is of size ({sentence_piece_size}).\n"\
f"But we need to extend to sentencepiece vocab size ({new_size})."
)
new_tokens = deepcopy(tokenizer_file.pieces[-len(added_tokens_ids):])
for new_token, added_token in zip(new_tokens, added_tokens_json.keys()):
new_token.piece = added_token.encode("utf-8")
new_token.score = -1000.0
new_token.type = SentencePieceTokenTypes.USER_DEFINED
pass
tokenizer_file.pieces.extend(new_tokens)
with open(f"{saved_location}/tokenizer.model", "wb") as file:
file.write(tokenizer_file.SerializeToString())
pass
# Add padding tokens
# actual_vocab_size = model.config.vocab_size
# padding = actual_vocab_size - len(tokenizer_file.pieces)
return
pass
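# Illustrative usage sketch: call this on a directory that already contains both
# tokenizer.model and added_tokens.json (the path below is a placeholder):
#
#   fix_sentencepiece_gguf("path/to/saved_model")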
def load_correct_tokenizer(
tokenizer_name,
model_max_length = None,
padding_side = "right",
token = None,
trust_remote_code = False,
cache_dir = "huggingface_tokenizers_cache",
):
if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT:
cache_dir = cache_dir
else:
cache_dir = None
pass
# Try loading the slow tokenizer. If it fails, then try Fast only
# Mainly to solve Deepseek models with no tokenizer.model file
slow_tokenizer = None
try:
slow_tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
trust_remote_code = trust_remote_code,
# Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373
use_fast = False,
legacy = False,
from_slow = True,
cache_dir = cache_dir,
)
except:
pass
# print(
# f"Unsloth: {tokenizer_name} has no tokenizer.model file.\n"\
# "Just informing you about this - this is not a critical error."
# )
pass
fast_tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
trust_remote_code = trust_remote_code,
cache_dir = cache_dir,
)
if tokenizer_name in IGNORED_TOKENIZER_NAMES:
return fast_tokenizer
elif slow_tokenizer is not None:
if hasattr(fast_tokenizer, "add_bos_token") and hasattr(slow_tokenizer, "add_bos_token"):
fast_tokenizer.add_bos_token = slow_tokenizer.add_bos_token
if hasattr(fast_tokenizer, "add_eos_token") and hasattr(slow_tokenizer, "add_eos_token"):
fast_tokenizer.add_eos_token = slow_tokenizer.add_eos_token
# Confirm if slow and fast are equivalent!
if assert_same_tokenization(slow_tokenizer, fast_tokenizer):
return fast_tokenizer
else:
logger.warning(f"Unsloth: Will load {tokenizer_name} as a legacy tokenizer.")
return convert_to_fast_tokenizer(slow_tokenizer)
pass
else:
return fast_tokenizer
pass
pass
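# Illustrative usage sketch; the tokenizer repo below is just an example name:
#
#   tokenizer = load_correct_tokenizer(
#       "unsloth/llama-2-7b-bnb-4bit",   # any HF tokenizer repo
#       model_max_length = 4096,
#       padding_side = "right",
#   )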
def check_tokenizer(
model,
tokenizer,
model_name = "unsloth/llama-2-7b-bnb-4bit",
model_max_length = 4096,
padding_side = "right",
token = None,
_reload = True,
):
# Checks tokenizer for out of bounds ids.
# Mainly a fix for https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
# where <sep> had token id=32002.
# See https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha/discussions/25
# Seems like the Fast tokenizer in Rust breaks things!
# We ignore some of them!
if tokenizer.__repr__().split("(", 1)[0] in IGNORED_TOKENIZER_CHECKING:
return tokenizer
pass
max_embedding_size = model.model.embed_tokens.weight.shape[0]
added_tokens_fast = tokenizer.added_tokens_decoder
added_tokens_fast = {index : str(value) for index, value in added_tokens_fast.items()}
sorted_keys = sorted(added_tokens_fast)
added_tokens_fast = {key : added_tokens_fast[key] for key in sorted_keys}
for j, index in enumerate(added_tokens_fast.keys()):
if index >= max_embedding_size:
bad_indices = list(added_tokens_fast.keys ())[j:]
bad_tokens = list(added_tokens_fast.values())[j:]
if not _reload:
# Try removing the token
added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()]
special_tokens = tokenizer.special_tokens_map
import itertools
special_tokens = frozenset(
itertools.chain.from_iterable(
[x] if type(x) is str else x for x in special_tokens.values()
)
)
can_be_removed1 = [x for x in bad_tokens if x not in special_tokens]
can_be_removed2 = [x for x in can_be_removed1 if x in tokenizer._added_tokens_encoder.keys()]
# Check if the extra tokens can in fact be removed!
can_be_removed = \
(len(can_be_removed1) == len(bad_tokens)) and \
(len(can_be_removed2) == len(bad_tokens))
# Check if sep_token or other generic types
remove_generic = False
try_mapper = []
if not can_be_removed:
names = dir(tokenizer)
names = (x for x in names if x.endswith("_token") and x.count("_") == 1)
generic_tokens = [(x, getattr(tokenizer, x, None)) for x in names]
try_removal = []
for token in bad_tokens:
for (name_token, check_token) in generic_tokens:
if check_token == token:
try_removal.append(token)
try_mapper.append(name_token)
pass
pass
pass
# Recheck!
can_be_removed = (len(try_removal) == len(bad_tokens))
if can_be_removed: remove_generic = True
can_be_removed1 = bad_tokens
pass
if can_be_removed:
# Yes it can be fixed!
for j, bad_token in enumerate(can_be_removed1):
remove_id = tokenizer._added_tokens_encoder[bad_token]
del tokenizer._added_tokens_decoder[remove_id]
del tokenizer._added_tokens_encoder[bad_token]
if remove_generic and (try_removal[j] == bad_token):
# Remove sep token for example
setattr(tokenizer, try_mapper[j], None)
setattr(tokenizer, try_mapper[j] + "_id", None)
pass
pass
# Confirm 1 more time!
if max(tokenizer.added_tokens_decoder.keys()) < max_embedding_size:
logger.warning_once(
f"Unsloth loaded a broken tokenizer `{model_name}`, but managed to repair it!\n"\
f"Tokens {bad_tokens} with ids {bad_indices} exceeds the max vocab size of {max_embedding_size}.\n"\
"We removed these bad tokens. If you think this is incorrect, fix your tokenizer first."
)
return convert_to_fast_tokenizer(tokenizer)
pass
pass
# :( Failure
raise RuntimeError(
f"Unsloth tried to load `{model_name}`, but cannot succeed.\n"\
f"Tokens {bad_tokens} with ids {bad_indices} exceeds the max vocab size of {max_embedding_size}.\n"\
f"Fix your tokenizer since it'll perform out of bounds memory accesses."
)
pass
if IS_COLAB_ENVIRONMENT or IS_KAGGLE_ENVIRONMENT:
cache_dir = "huggingface_tokenizers_cache"
else:
cache_dir = None
pass
# Sometimes the slow tokenizer does not work (e.g. Deepseek)
try:
# Try slow tokenizer which can fix things!
tokenizer = AutoTokenizer.from_pretrained(
model_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
# Cannot just use use_fast = False as per https://twitter.com/danielhanchen/status/1789659394302718373
use_fast = False,
legacy = False,
from_slow = True,
cache_dir = cache_dir,
)
return check_tokenizer(
model = model,
tokenizer = tokenizer,
model_name = model_name,
model_max_length = model_max_length,
padding_side = padding_side,
token = token,
_reload = False,
)
except:
# Tokenizer has out of bounds issues and we can't
# load the slow tokenizer version :(
logger.warning_once(
"Unsloth: Tokenizer is most likely buggy, and Unsloth failed to repair it.\n"\
"It will still work, but beware of out of bounds memory accesses.\n"\
"Please file an issue on the model owner's repo about this issue."
)
return tokenizer
pass
pass
pass
return convert_to_fast_tokenizer(tokenizer)
pass
@torch.inference_mode
def fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16):
"""
Llama-3, for example, has untrained vectors in the base model.
These include <|eot_id|>, <|start_header_id|> and <|end_header_id|>.
We reset them to the mean of the rest of the tokens.
"""
embedding_matrix = model.get_input_embeddings ().weight
lm_head_matrix = model.get_output_embeddings().weight
# Ignore some model checks for now
if model.config._name_or_path in IGNORED_TOKENIZER_NAMES:
return
pass
# Get untrained tokens
indicator_untrained1 = torch.amax(embedding_matrix, axis = 1) <= eps
# Check lm_head as well
indicator_untrained2 = torch.amax(lm_head_matrix, axis = 1) <= eps
# Combine both checks
indicator_untrained = indicator_untrained1 & indicator_untrained2
where_untrained = torch.where(indicator_untrained)[0]
n_untrained = where_untrained.shape[0]
n_trained = embedding_matrix.shape[0] - n_untrained
# Get set and actual tokens
where_untrained = where_untrained.tolist()
if len(where_untrained) == 0: return
# Build a fast lookup set of untrained ids and get their token strings
where_untrained_set = frozenset(where_untrained)
actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained)
# Remove None items in actual_bad_tokens
actual_bad_tokens = [x for x in actual_bad_tokens if x is not None]
# Check if tokenizer and training datasets have bad tokens
if_bad_first = False
if_bad_second = False
# Check tokenizer's chat template for any untrained tokens
chat_template = getattr(tokenizer, "chat_template", None)
if chat_template is not None:
if_bad_first = any(x in chat_template for x in actual_bad_tokens)
pass
# Check the first 250, last 250 input_ids
size_dataset = len(train_dataset)
size = min(size_dataset, 250)
for j in range(size):
input_ids = train_dataset[j]
if "input_ids" in input_ids:
input_ids = input_ids["input_ids"]
if_bad = any(item in where_untrained_set for item in input_ids)
if if_bad:
if_bad_second = True
break
pass
pass
pass
# Check last 250
if not if_bad_second:
left = max(size_dataset-250, 0)
for j in range(left, size_dataset):
input_ids = train_dataset[j]
if "input_ids" in input_ids:
input_ids = input_ids["input_ids"]
if_bad = any(item in where_untrained_set for item in input_ids)
if if_bad:
if_bad_second = True
break
pass
pass
pass
pass
# Check if bad tokens exist!
if not if_bad_first and not if_bad_second: return
# Check if lm_head / embed_tokens are trainable!
bad_not_trainable = False
if not embedding_matrix.requires_grad: bad_not_trainable = True
if not lm_head_matrix .requires_grad: bad_not_trainable = True
if bad_not_trainable:
raise ValueError(
'Unsloth: Untrained tokens found, but embed_tokens & lm_head not trainable, causing NaNs. '\
'Restart then add `embed_tokens` & `lm_head` to '\
'`FastLanguageModel.get_peft_model(target_modules = [..., "embed_tokens", "lm_head",]). `'\
'Are you using the `base` model? Instead, use the `instruct` version to silence this warning.',
)
pass
# Count all the possible bad tokens
final_counts = np.zeros(max(len(tokenizer), embedding_matrix.shape[0]), dtype = np.int64)
def mapping(examples):
input_ids = examples["input_ids"]
counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype = np.int32)
np.add.at(final_counts, counter, 1)
pass
train_dataset.map(mapping, batched = True, desc = "Counting untrained tokens")
# Get sum of all items
sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0)
sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0)
# Remove bad tokens
sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0)
sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0)
# Find correct average by dividing by sum of trained tokens
mean_embedding = (sum_embedding / n_trained)
mean_lm_head = (sum_lm_head / n_trained)
# Scale each to be equal to 1/max_frequency. Also set some to 0 if none seen
scaling = final_counts[where_untrained] / max(final_counts.max(), 1)
scaling = torch.tensor(scaling, device = mean_embedding.device).unsqueeze(1)
mean_embedding = mean_embedding.repeat((n_untrained, 1,)) * scaling
mean_lm_head = mean_lm_head .repeat((n_untrained, 1,)) * scaling
where_null = scaling.ravel() == 0
mean_embedding[where_null] = 0
mean_lm_head [where_null] = 0
# Set them to the mean
logger.warning(
"Unsloth: Setting embed_tokens & lm_head untrained tokens to "\
"mean(trained) to counteract NaNs during training."
)
embedding_matrix[where_untrained] = mean_embedding.to(embedding_matrix.dtype)
lm_head_matrix [where_untrained] = mean_lm_head .to(lm_head_matrix .dtype)
# Clean up
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
pass
return
pass
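# Illustrative usage sketch: this is normally invoked automatically by the
# patched SFTTrainer.train (see patch_sft_trainer_tokenizer below), but it can
# also be called manually on a tokenized dataset with an "input_ids" column:
#
#   fix_untrained_tokens(model, tokenizer, train_dataset, eps = 1e-16)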
@torch.inference_mode
def mean_of_trained_tokens(model, eps = 1e-16):
"""
Llama-3, for example, has untrained vectors in the base model.
These include <|eot_id|>, <|start_header_id|> and <|end_header_id|>.
We reset them to the mean of the rest of the tokens.
"""
embedding_matrix = model.get_input_embeddings ().weight.clone()
lm_head_matrix = model.get_output_embeddings().weight.clone()
# Get untrained tokens
indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps
where_untrained = torch.where(indicator_untrained)[0]
n_untrained = where_untrained.shape[0]
n_trained = embedding_matrix.shape[0] - n_untrained
# if n_untrained != 0:
# print(
# f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\
# "We shall set them to the mean of the other trained tokens."
# )
# pass
# Get sum of all items
sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0)
sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0)
# Remove bad tokens
sum_embedding -= torch.sum(embedding_matrix[where_untrained], dtype = torch.float32, axis = 0)
sum_lm_head -= torch.sum(lm_head_matrix [where_untrained], dtype = torch.float32, axis = 0)
# Find correct average by dividing by sum of trained tokens
mean_embedding = (sum_embedding / n_trained)
mean_lm_head = (sum_lm_head / n_trained)
return mean_embedding, mean_lm_head
pass
@torch.inference_mode
def add_new_tokens(
model,
tokenizer,
new_tokens = [],
method = "mean",
interpolation = 0.5,
):
"""
Smartly resizes the tokenizer and adds new tokens to the model.
We also disregard untrained tokens by removing them from the mean calculation.
"""
assert(isinstance(new_tokens, (list, tuple)))
assert(len(new_tokens) > 0)
assert(method == "mean" or method == "interpolation")
assert(interpolation >= 0 and interpolation <= 1)
# Check if tokens already exist
overlapping_tokens = set(new_tokens) & set(tokenizer.vocab.keys())
if len(overlapping_tokens) != 0:
print(
f"Unsloth: You're adding new_tokens = {new_tokens}\n"\
f"There are tokens which are overlapping = {list(overlapping_tokens)}\n"\
f"We shall safely ignore these overlapping tokens."
)
new_tokens = [x for x in new_tokens if x not in overlapping_tokens]
pass
# Get mean of trained tokens
# mean_embedding, mean_lm_head = fix_untrained_tokens(model)
# Be careful: reserved (untrained) tokens can unexpectedly show up here
mean_embedding, mean_lm_head = mean_of_trained_tokens(model)
mean_embedding = mean_embedding.to(torch.float32)
mean_lm_head = mean_lm_head .to(torch.float32)
# Add tokens!
old_length = len(tokenizer)
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
# If we use interpolation, we interpolate between the mean of all trained embeddings
# and the mean of each new token's subword embeddings
embedding_matrix = model.get_input_embeddings ().weight
lm_head_matrix = model.get_output_embeddings().weight
if method == "interpolation":
print(
"Unsloth: You are using interpolation to add new tokens.\n"\
f"We shall set new tokens = mean(embeddings)*{1-interpolation} + mean(new_tokens)*{interpolation}"
)
for j, token in enumerate(new_tokens):
input_ids = tokenizer(token, add_special_tokens = False).input_ids
mean_embedding_token = embedding_matrix[input_ids].mean(axis = 0, dtype = torch.float32)
mean_lm_head_token = lm_head_matrix [input_ids].mean(axis = 0, dtype = torch.float32)
# Interpolate
mean_embedding_token = mean_embedding*(1-interpolation) + mean_embedding_token*interpolation
mean_lm_head_token = mean_lm_head *(1-interpolation) + mean_lm_head_token *interpolation
# Set the new vector
embedding_matrix[old_length+j] = mean_embedding_token
lm_head_matrix [old_length+j] = mean_lm_head_token
pass
else:
# Now set the new tokens to the mean!
embedding_matrix[old_length:] = mean_embedding
lm_head_matrix [old_length:] = mean_lm_head
pass
# We set a flag to say we need to train embeddings
internal_model = model
while hasattr(internal_model, "model"):
internal_model._need_to_train_embeddings = True
internal_model = internal_model.model
pass
internal_model._need_to_train_embeddings = True
return
pass
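# Illustrative usage sketch; the tokens below are placeholders:
#
#   add_new_tokens(
#       model, tokenizer,
#       new_tokens = ["<|my_token_0|>", "<|my_token_1|>"],
#       method = "interpolation",
#       interpolation = 0.5,
#   )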
def check_nvidia():
# Unsloth doesn't work yet on AMD devices - we're working on it!
output = np.array([0,])
try:
output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
except:
if not torch.cuda.is_available():
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
return output
pass
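# Illustrative note: check_nvidia() shells out to nvidia-smi and returns a numpy
# array of per-GPU used memory in GiB, e.g. array([3.2, 0.5]) on a hypothetical
# 2-GPU machine. PRE_CHECK below snapshots this at import time so the patched
# trainer can later detect unsupported multi-GPU usage.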
PRE_CHECK = check_nvidia()
from inspect import getsource
import trl.trainer.sft_trainer
from trl.trainer.sft_trainer import *
from transformers.trainer import *
def patch_sft_trainer_tokenizer():
"""
Patches TRL's SFTTrainer: tokenization avoids adding a duplicate BOS token,
and .train() runs GPU checks and fixes untrained tokens before training.
"""
for function_name, replacer in (
("_prepare_non_packed_dataloader", "def tokenize(element):",),
# ("_prepare_packed_dataloader", "if dataset_text_field is not None",),
):
function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}"))
where = function.find("def")
function = function.split("\n")
function = "\n".join(x[where:] for x in function)
check_text = \
"\n"\
"test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])[0]\n"\
"chat_template = getattr(tokenizer, 'chat_template', None)\n"\
"chat_template = '' if chat_template is None else chat_template\n"\
"has_bos_token_already = (test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template) "\
"if getattr(tokenizer, 'bos_token', None) is not None else False\n"\
"add_special_tokens = False if has_bos_token_already else add_special_tokens\n\n"
check_text = check_text.split("\n")
check_text = "\n".join(" "*where + x for x in check_text)
function = function.replace(replacer, check_text + replacer)
exec(function, globals())
exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals())
pass
# Patch train with fix_untrained_tokens
function_name, replacer = "train", "if resume_from_checkpoint is False:"
function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}"))
where = function.find("def")
function = function.split("\n")
function = "\n".join(x[where:] for x in function)
check_text = \
"\n"\
"if self._inner_training_loop.__name__ != '_fast_inner_training_loop':\n"\
" raise RuntimeError(\n"\
" 'Please do not edit specific areas of the Unsloth codebase or you will get CUDA segfaults.'\n"\
" )\n"\
"pass\n"\
"import subprocess, re, gc, numpy as np\n"\
"a = np.array([0,])\n"\
"try:\n"\
" a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
" a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
" a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
"except:\n"\
" if not torch.cuda.is_available():\n"\
" raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
"if ((a - PRE_CHECK) >= 1).sum() > 1:\n"\
" raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
"for _ in range(3):\n"\
" gc.collect()\n"\
" torch.cuda.empty_cache()\n"\
"pass\n"\
"\n"\
"fix_untrained_tokens(self.model, self.tokenizer, self.train_dataset, eps = 1e-16)\n\n"
check_text = check_text.split("\n")
check_text = "\n".join(" "*where + x for x in check_text)
function = function.replace(replacer, check_text + replacer)
exec(function, globals())
exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals())
pass
patch_sft_trainer_tokenizer()
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import Optional
from transformers import TrainingArguments
from trl import SFTTrainer
from . import is_bfloat16_supported
__all__ = [
"UnslothTrainingArguments",
"UnslothTrainer",
]
@dataclass
class UnslothTrainingArguments(TrainingArguments):
embedding_learning_rate : Optional[float] = field(
default = None,
metadata = {"help" : "Different learning rates for embeddings and lm_head."}
)
pass
def _create_unsloth_optimizer(
model,
optimizer_cls,
optimizer_kwargs,
embedding_lr = 5e-5,
):
lr = optimizer_kwargs["lr"]
weight_decay = optimizer_kwargs.get("weight_decay", 0.0)
param_groups = \
{
"non_embeddings" : {},
"embeddings" : {},
}
for name, param in model.named_parameters():
if not param.requires_grad: continue
if name.endswith("modules_to_save.default.weight"):
partial_name = name[:-len(".modules_to_save.default.weight")]
partial_name = partial_name[partial_name.rfind(".")+1:]
print(f"Unsloth: Setting lr = {embedding_lr:.2e} instead of {lr:.2e} for {partial_name}.")
param_groups["embeddings"] [name] = param
else:
param_groups["non_embeddings"][name] = param
pass
pass
optimizer_grouped_parameters = [
{
"params" : list(param_groups["non_embeddings"].values()),
"weight_decay" : weight_decay,
"lr" : lr,
},
{
"params" : list(param_groups["embeddings"].values()),
"weight_decay" : weight_decay,
"lr" : embedding_lr,
},
]
optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
return optimizer
pass
class UnslothTrainer(SFTTrainer):
def create_optimizer(self):
embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None)
if embedding_learning_rate is None: return super().create_optimizer()
if self.optimizer is None:
optimizer_cls, optimizer_kwargs = SFTTrainer.get_optimizer_cls_and_kwargs(self.args)
self.optimizer = _create_unsloth_optimizer(
self.model,
optimizer_cls,
optimizer_kwargs,
embedding_learning_rate,
)
pass
return self.optimizer
pass
pass
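# Illustrative usage sketch, assuming `model`, `tokenizer` and `dataset` already
# exist; embedding_learning_rate gives embed_tokens / lm_head a smaller learning
# rate than the rest of the network. Paths and column names are placeholders:
#
#   args = UnslothTrainingArguments(
#       output_dir = "outputs",              # hypothetical path
#       per_device_train_batch_size = 2,
#       learning_rate = 2e-4,
#       embedding_learning_rate = 2e-5,
#       bf16 = is_bfloat16_supported(),
#   )
#   trainer = UnslothTrainer(
#       model = model,
#       tokenizer = tokenizer,
#       train_dataset = dataset,
#       dataset_text_field = "text",         # hypothetical column name
#       args = args,
#   )
#   trainer.train()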