# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import *
from ._utils import __version__
try:
from transformers.models.gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaModel,
GemmaForCausalLM,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
except:
from packaging.version import Version
transformers_version = Version(transformers_version)
if not transformers_version >= Version("4.38"):
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
f"The minimum required version is 4.38.\n"\
f'Try `pip install --upgrade "transformers>=4.38"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
pass
pass
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)
# For PyTorch 2.1.1
try:
from transformers.models.gemma.modeling_gemma import (
GemmaSdpaAttention,
GemmaFlashAttention2,
)
except:
GemmaSdpaAttention = GemmaAttention
GemmaFlashAttention2 = GemmaAttention
pass
torch_nn_functional_gelu = torch.nn.functional.gelu
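# Fused GEGLU inference path. Roughly equivalent to the reference
#   down_proj(gelu(gate_proj(X), approximate = "tanh") * up_proj(X)),
# except the `up` buffer is reused as the output storage for down_proj to avoid an extra allocation.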
def fast_geglu_inference(self, X):
# gate = self.gate_proj(X)
# up = self.up_proj(X)
bsz, _, hd = X.shape
# mlp_size = self.config.intermediate_size
# temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda:0")
gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0])
up = fast_linear_forward(self. up_proj, X)#, out = temp[1])
gate = torch_nn_functional_gelu(gate, approximate = "tanh")
gate *= up
# X = self.down_proj(gate)
down = fast_linear_forward(self.down_proj, gate, out = up[:,:,:hd])
return down
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
def GemmaDecoderLayer_fast_forward(
self,
hidden_states: torch.Tensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
padding_mask: Optional[torch.LongTensor] = None,
*args, **kwargs,
):
if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None:
out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda:0")
# Self Attention
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(self.input_layernorm, hidden_states, out_weight)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states += residual
# Fully Connected
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(self.post_attention_layernorm, hidden_states, out_weight)
hidden_states = fast_geglu_inference(self.mlp, hidden_states)
hidden_states += residual
else:
residual = hidden_states
hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states, gemma = True)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states, gemma = True)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
pass
outputs = (hidden_states,)
if output_attentions: outputs += (self_attn_weights,)
if use_cache: outputs += (present_key_value,)
return outputs
pass
from math import sqrt as math_sqrt
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
# @torch.inference_mode
def GemmaModel_fast_forward_inference(
self,
input_ids,
past_key_values,
position_ids,
attention_mask = None,
):
out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda:0")
input_ids = input_ids[:,:self.max_seq_length]
hidden_states = self.model.embed_tokens(input_ids)
hidden_states = hidden_states.to(self.config.torch_dtype)
# 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
# 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
hidden_states *= torch.tensor(math_sqrt(self.config.hidden_size), dtype = hidden_states.dtype)
bsz, q_len, hd = hidden_states.shape
seq_len = past_key_values[0][0].shape[-2]
if bsz != 1:
attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
(bsz, q_len),
hidden_states,
seq_len,
)
pass
next_decoder_cache = []
for idx, decoder_layer in enumerate(self.model.layers):
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.input_layernorm, hidden_states, out_weight)
hidden_states, present_key_value = LlamaAttention_fast_forward_inference(
decoder_layer.self_attn,
hidden_states = hidden_states,
past_key_value = past_key_values[idx],
position_ids = position_ids,
attention_mask = attention_mask,
do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
)
hidden_states += residual
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.post_attention_layernorm, hidden_states, out_weight)
hidden_states = fast_geglu_inference(decoder_layer.mlp, hidden_states)
hidden_states += residual
next_decoder_cache.append(present_key_value)
pass
hidden_states = fast_rms_layernorm_inference_gemma(self.model.norm, hidden_states, out_weight)
return BaseModelOutputWithPast(
last_hidden_state = hidden_states,
past_key_values = next_decoder_cache,
hidden_states = [],
attentions = [],
)
pass
# Follows line by line https://github.com/google-deepmind/gemma/blob/main/gemma/positional_embeddings.py#L45
# Formulates cos and sin differently from Llama!
class GemmaFixedRotaryEmbedding(torch.nn.Module):
# Fixes https://github.com/huggingface/transformers/pull/28837
# https://github.com/microsoft/DeepSpeed/issues/4932
# The precision of RoPE buffers is not correct, so we cast to int64.
def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None,
config = None, # [TODO] Hack to pass in config - need to remove later
):
super().__init__()
if config is not None: return # [TODO] Hack to pass in config - need to remove later
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
# For dynamic RoPE we first cap the cache at 4 * 8192 tokens, then iteratively grow it
self.current_rope_size = min(4 * 8192, self.max_position_embeddings)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
pass
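# Cache construction sketch (mirrors the DeepMind reference linked above):
#   timescale[i]  = base ** (2 * i / dim)   for i in [0, dim // 2)
#   radians[t, i] = position_t / timescale[i]
# cos/sin are cached in float32 on the GPU, since RoPE must be applied in float32.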
def _set_cos_sin_cache(self, seq_len, device, dtype):
# Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
# in FP32. They are applied (multiplied) in FP32 as well.
self.current_rope_size = seq_len
# The difference is we do the division explicitly instead of t * (1/x), i.e. we compute t/x.
freq_exponents = (2.0 / self.dim) * (
torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float()
)
timescale = self.base**freq_exponents
positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float()
radians_new = positions[..., None] / timescale[None, None, :]
radians_new = radians_new.squeeze(0)
emb = torch.cat((radians_new, radians_new), dim = -1)
# We must do RoPE in float32!
cos = emb.cos().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
sin = emb.sin().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
self.register_buffer("cos_cached", cos, persistent = False)
self.register_buffer("sin_cached", sin, persistent = False)
pass
def forward(self, x, position_ids=None, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.current_rope_size:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
pass
def extend_rope_embedding(self, x, seq_len):
if seq_len <= self.current_rope_size: return
# Iteratively grow by increments of 8192
self.current_rope_size = int(round(seq_len / 8192)) * 8192
self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
pass
pass
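# Illustrative usage (the names and values here are assumptions, not part of this module):
#   rope = GemmaFixedRotaryEmbedding(dim = head_dim, max_position_embeddings = 8192)
#   cos, sin = rope(V, seq_len = kv_seq_len)   # slices of the float32 caches, cast to V.dtype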
class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
# Fixes https://github.com/huggingface/transformers/pull/28837
# https://github.com/microsoft/DeepSpeed/issues/4932
# The precision of RoPE buffers is not correct, so we cast to int64.
def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0,
config = None, # [TODO] Hack to pass in config - need to remove later
):
self.scaling_factor = scaling_factor
super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config)
pass
def _set_cos_sin_cache(self, seq_len, device, dtype):
# Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
# in FP32. They are applied (multiplied) in FP32 as well.
self.current_rope_size = seq_len
# The difference is we do the division explicitly instead of t * (1/x), i.e. we compute t/x.
freq_exponents = (2.0 / self.dim) * (
torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float()
)
timescale = self.base**freq_exponents
positions = torch.arange(self.current_rope_size, device = "cpu", dtype = torch.int64).float()
positions = positions / self.scaling_factor
radians_new = positions[..., None] / timescale[None, None, :]
radians_new = radians_new.squeeze(0)
emb = torch.cat((radians_new, radians_new), dim = -1)
# We must do RoPE in float32!
cos = emb.cos().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
sin = emb.sin().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
self.register_buffer("cos_cached", cos, persistent = False)
self.register_buffer("sin_cached", sin, persistent = False)
pass
pass
class FastGemmaModel(FastLlamaModel):
@staticmethod
def pre_patch():
init_name, function = patch_linear_scaling(
model_name = "gemma",
rope_module = GemmaFixedRotaryEmbedding,
scaled_rope_module = GemmaFixedLinearScalingRotaryEmbedding,
attention_module = GemmaAttention,
)
if init_name is not None:
exec(function, globals())
GemmaAttention.__init__ = eval(init_name)
pass
GemmaAttention .forward = LlamaAttention_fast_forward
GemmaSdpaAttention .forward = LlamaAttention_fast_forward
GemmaFlashAttention2.forward = LlamaAttention_fast_forward
GemmaDecoderLayer .forward = GemmaDecoderLayer_fast_forward
GemmaModel .forward = LlamaModel_fast_forward
GemmaForCausalLM .forward = CausalLM_fast_forward(GemmaModel_fast_forward_inference)
PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(GemmaForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.gemma.modeling_gemma
transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding = GemmaFixedRotaryEmbedding
return
pass
@staticmethod
def post_patch(model):
# Patch model for Gemma
layers = model.model.layers
# Torch.compile fails on embedding matrix??
# Workaround randomly fixes it for torch versions < 2.2
model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
model.config.update({"unsloth_version" : __version__})
# We also do this for the lm_head
lm_head = torch.nn.Linear(1, 1, bias = None)
del lm_head.weight
lm_head.weight = model.lm_head.weight
lm_head.in_features = lm_head.weight.shape[1]
lm_head.out_features = lm_head.weight.shape[0]
model.lm_head = lm_head
# Gemma has tied weights! This means lm_head == embed_tokens
if model.model.embed_tokens.weight.data_ptr() != model.lm_head.weight.data_ptr():
lm_head = torch.nn.Linear(1, 1, bias = None)
del lm_head.weight
lm_head.weight = model.model.embed_tokens.weight
lm_head.in_features = lm_head.weight.shape[1]
lm_head.out_features = lm_head.weight.shape[0]
model.lm_head = lm_head
pass
# Also patch all dtypes - BnB seems to not allocate the correct type?
# BnB default dtype seems to be float16!
correct_dtype = lm_head.weight.dtype
for name, module in model.named_modules():
if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)):
weight = module.weight
quant_state = weight.quant_state
if type(quant_state) is list:
# BnB seems to have float16 as default!
module.weight.quant_state[2] = correct_dtype # Cast to correct dtype
else:
# https://github.com/TimDettmers/bitsandbytes/pull/763/files
quant_state.dtype = correct_dtype
pass
pass
# Downcast RoPE embedding to correct data type
# RoPE must be done in float32 for Gemma
# if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")) \
# and (module.cos_cached.dtype != correct_dtype):
# module.cos_cached = module.cos_cached.to(correct_dtype)
# module.sin_cached = module.sin_cached.to(correct_dtype)
# pass
# pass
pass
# Add 1 to weight
# return output * (1 + self.weight)
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma/modeling_gemma.py#L89
from transformers.models.gemma.modeling_gemma import GemmaRMSNorm
# Freeze all parameters except LoRA
# We do this first since the in-place += 1 is not allowed on tensors with requires_grad = True
for name, param in model.named_parameters():
if ".lora_A." in name or ".lora_B." in name:
param.requires_grad_(True)
else:
param.requires_grad_(False)
pass
# Patch RMS Layernorm
for name, module in model.named_modules():
if isinstance(module, GemmaRMSNorm):
# Must be in float32
# https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L36
# module = module.to(torch.float32)
# Leave + 1 to Triton kernel itself
# module.weight += 1.0 # return output * (1 + self.weight)
if not hasattr(module, "variance_epsilon"):
module.variance_epsilon = module.eps # Gemma doesn't use variance_epsilon
pass
# Clear deleted GPU items
import gc
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
return model
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import *
from ._utils import __version__
from .gemma import (
GemmaFixedRotaryEmbedding,
GemmaFixedLinearScalingRotaryEmbedding,
fast_geglu_inference,
)
try:
from transformers.models.gemma2.modeling_gemma2 import (
Gemma2Attention,
Gemma2DecoderLayer,
Gemma2Model,
Gemma2ForCausalLM,
Gemma2RotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
except:
from packaging.version import Version
transformers_version = Version(transformers_version)
if not transformers_version >= Version("4.42"):
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
f"The minimum required version is 4.42.3.\n"\
f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
pass
pass
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)
# For PyTorch 2.1.1
try:
from transformers.models.gemma2.modeling_gemma2 import (
Gemma2SdpaAttention,
Gemma2FlashAttention2,
)
except:
Gemma2SdpaAttention = Gemma2Attention
Gemma2FlashAttention2 = Gemma2Attention
pass
# [TODO] We must randomly use torch.compile?
# I checked the gradients and formulas and I'm sure it's correct.
# I'm stumped :(
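# The compiled kernel below implements Gemma's RMSNorm in float32:
#   y = x / sqrt(mean(x ** 2) + eps) * (1 + weight)
# and casts back to the original dtype at the end.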
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True):
old_dtype = X.dtype
X = X.float()
X = X * torch.rsqrt(X.square().mean(-1, keepdim = True) + layernorm.eps) * \
(1.0 + layernorm.weight.float())
return X.to(old_dtype)
pass
# Logit softcapping
def Gemma2Attention_fast_forward(
self,
hidden_states: torch.Tensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
padding_mask: Optional[torch.LongTensor] = None,
*args, **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# Clear inference
if hasattr(self, "paged_attention"):
del self.paged_attention_K
del self.paged_attention_V
del self.paged_attention
del self.temp_QA
del self.temp_KV
del self.RH_Q
del self.attention
pass
bsz, q_len, _ = hidden_states.size()
n_heads = self.num_heads
n_groups = self.num_key_value_groups
n_kv_heads = self.num_key_value_heads
head_dim = self.head_dim
assert(n_kv_heads * n_groups == n_heads)
Q, K, V = self.apply_qkv(self, hidden_states)
Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
kv_seq_len = K.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
if position_ids is None:
cos = self.rotary_emb.cos_cached
sin = self.rotary_emb.sin_cached
Q, K = fast_rope_embedding(Q, K, cos, sin)
else:
cos, sin = self.rotary_emb(V, seq_len = kv_seq_len)
Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
pass
if past_key_value is not None:
K = torch.cat([past_key_value[0], K], dim = 2)
V = torch.cat([past_key_value[1], V], dim = 2)
pass
past_key_value = (K, V) if use_cache else None
A = slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, kv_seq_len)
A = self.apply_o(self, A)
return A, None, past_key_value
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
def Gemma2DecoderLayer_fast_forward(
self,
hidden_states: torch.Tensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
padding_mask: Optional[torch.LongTensor] = None,
*args, **kwargs,
):
if use_cache and hasattr(self, "_flag_for_generation"): #past_key_value is not None:
out_weight = torch.empty(self.input_layernorm.weight.shape, dtype = torch.float32, device = "cuda:0")
# Self Attention
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(self.input_layernorm, hidden_states, out_weight)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states = fast_rms_layernorm_inference_gemma(self.post_attention_layernorm, hidden_states, out_weight)
hidden_states += residual
# Fully Connected
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(self. pre_feedforward_layernorm, hidden_states, out_weight)
hidden_states = fast_geglu_inference(self.mlp, hidden_states)
hidden_states = fast_rms_layernorm_inference_gemma(self.post_feedforward_layernorm, hidden_states, out_weight)
hidden_states += residual
else:
residual = hidden_states
hidden_states = fast_rms_layernorm_gemma2_compiled(self.input_layernorm, hidden_states, gemma = True)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states = fast_rms_layernorm_gemma2_compiled(self.post_attention_layernorm, hidden_states, gemma = True)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = fast_rms_layernorm_gemma2_compiled(self. pre_feedforward_layernorm, hidden_states, gemma = True)
hidden_states = self.mlp(hidden_states)
hidden_states = fast_rms_layernorm_gemma2_compiled(self.post_feedforward_layernorm, hidden_states, gemma = True)
hidden_states = residual + hidden_states
pass
outputs = (hidden_states,)
if output_attentions: outputs += (self_attn_weights,)
if use_cache: outputs += (present_key_value,)
return outputs
pass
from math import sqrt as math_sqrt
KV_CACHE_INCREMENT = 256 # KV Cache update size
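# The paged KV buffers below are preallocated with seq_len + 1 + KV_CACHE_INCREMENT slots
# and grown in-place (resize_) by another KV_CACHE_INCREMENT whenever generation fills them,
# so the cache is not reallocated on every new token.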
torch_nn_functional_softmax = torch.nn.functional.softmax
def Gemma2Attention_fast_forward_inference(
self,
hidden_states: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]],
position_ids,
do_prefill = False,
attention_mask = None,
use_sliding_window = False,
):
Xn = hidden_states
bsz, _, hd = hidden_states.size()
K1, V1 = past_key_value
dtype = Xn.dtype
n_heads = self.num_heads
n_groups = self.num_key_value_groups
n_kv_heads = self.num_key_value_heads
head_dim = self.head_dim
attention_size = n_heads*head_dim
# assert(n_kv_heads * n_groups == n_heads)
seq_len = K1.shape[-2]
kv_seq_len = seq_len + 1
# Prefill phase
# if not hasattr(self, "paged_attention"):
if do_prefill:
self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda:0")
self.paged_attention_K = self.paged_attention[:,0]
self.paged_attention_V = self.paged_attention[:,1]
self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
# Only for Gemma2
self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma2 9b should use 256 and not 224 (hidden_size / num_attention_heads); 27b uses the formula below.
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
self.scalar = 1.0 / math_sqrt(self.config.query_pre_attn_scalar)
# self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
self.half_head_dim = head_dim // 2
self. t = self.config.attn_logit_softcapping
self.reciprocal_t = 1.0 / self.config.attn_logit_softcapping
elif kv_seq_len >= self.paged_attention.shape[0]:
self.paged_attention.resize_((self.paged_attention.shape[0]+KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
self.paged_attention_K = self.paged_attention[:,0]
self.paged_attention_V = self.paged_attention[:,1]
self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1]+KV_CACHE_INCREMENT))
pass
Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
Qn = Qn.view(bsz, 1, n_heads, head_dim).transpose(1, 2)
Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
# cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
# Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
cos = self.rotary_emb.cos_cached[position_ids].unsqueeze(1)
sin = self.rotary_emb.sin_cached[position_ids].unsqueeze(1)
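# In-place RoPE: build rotate_half(Q) = (-q_second_half, q_first_half) inside the RH_Q
# scratch buffer, then compute Qn = Qn * cos + rotate_half(Qn) * sin; same for Kn below.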
h = self.half_head_dim
RH_Q = self.RH_Q
RH_Q[:,:,:,:h] = Qn[:,:,:,h:]
RH_Q[:,:,:,h:] = Qn[:,:,:,:h]
torch.neg(RH_Q[:,:,:,:h], out = RH_Q[:,:,:,:h])
Qn *= cos
Qn.addcmul_(RH_Q, sin)
RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
RH_K[:,:,:,:h] = Kn[:,:,:,h:]
RH_K[:,:,:,h:] = Kn[:,:,:,:h]
torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h])
Kn *= cos
Kn.addcmul_(RH_K, sin)
# New KV cache
# Kn = torch.cat([K1, Kn], dim = 2)
# Vn = torch.cat([V1, Vn], dim = 2)
self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)
# Handle sliding windows
sliding_window = self.config.sliding_window
if use_sliding_window and kv_seq_len > sliding_window:
# From https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py#L193
slicing_tokens = 1 - sliding_window
Knn = Kn[:, :, slicing_tokens:, :]#.contiguous()
Vnn = Vn[:, :, slicing_tokens:, :]#.contiguous()
else:
Knn, Vnn = Kn, Vn
pass
# Grouped query attention
_, _, cached_len, _ = Knn.shape
if n_groups != 1:
Knn = Knn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Vnn = Vnn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Knn = Knn.reshape(bsz, n_heads, cached_len, head_dim)
Vnn = Vnn.reshape(bsz, n_heads, cached_len, head_dim)
pass
# else:
# Knn, Vnn = Knn, Vnn
# pass
# Attention
# if bsz == 1:
Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
# It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
# if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
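# Logit softcapping: scores -> t * tanh(scores / t) with t = attn_logit_softcapping,
# which keeps the attention logits inside (-t, t).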
A *= self.reciprocal_t; torch.tanh(A, out = A); A *= self.t; # Logit softcapping
A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
A = torch.matmul(A, Vnn, out = Qn)
# else:
# A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False)
# pass
A = A.transpose(1, 2)
A = A.reshape(bsz, 1, attention_size)
A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
return A, (Kn, Vn)
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
# @torch.inference_mode
def Gemma2Model_fast_forward_inference(
self,
input_ids,
past_key_values,
position_ids,
attention_mask = None,
):
out_weight = torch.empty_like(self.model.layers[0].input_layernorm.weight, dtype = torch.float32, device = "cuda:0")
input_ids = input_ids[:,:self.max_seq_length]
hidden_states = self.model.embed_tokens(input_ids)
hidden_states = hidden_states.to(self.config.torch_dtype)
# 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
# 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
hidden_states *= torch.tensor(math_sqrt(self.config.hidden_size), dtype = hidden_states.dtype)
bsz, q_len, hd = hidden_states.shape
seq_len = past_key_values[0][0].shape[-2]
if bsz != 1:
SWA = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
(bsz, q_len),
hidden_states,
seq_len,
sliding_window = self.config.sliding_window,
)
GA = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
(bsz, q_len),
hidden_states,
seq_len,
)
else:
SWA = attention_mask
GA = attention_mask
pass
next_decoder_cache = []
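# Gemma2 alternates attention types: even-indexed layers use the sliding-window mask (SWA),
# odd-indexed layers use the global mask (GA).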
for idx, decoder_layer in enumerate(self.model.layers):
use_sliding_window = idx % 2 == 0
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.input_layernorm, hidden_states, out_weight)
hidden_states, present_key_value = Gemma2Attention_fast_forward_inference(
decoder_layer.self_attn,
hidden_states = hidden_states,
past_key_value = past_key_values[idx],
position_ids = position_ids,
attention_mask = SWA if use_sliding_window else GA,
do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
use_sliding_window = use_sliding_window,
)
hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.post_attention_layernorm, hidden_states, out_weight)
hidden_states += residual
residual = hidden_states
hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer. pre_feedforward_layernorm, hidden_states, out_weight)
hidden_states = fast_geglu_inference(decoder_layer.mlp, hidden_states)
hidden_states = fast_rms_layernorm_inference_gemma(decoder_layer.post_feedforward_layernorm, hidden_states, out_weight)
hidden_states += residual
next_decoder_cache.append(present_key_value)
pass
hidden_states = fast_rms_layernorm_inference_gemma(self.model.norm, hidden_states, out_weight)
return BaseModelOutputWithPast(
last_hidden_state = hidden_states,
past_key_values = next_decoder_cache,
hidden_states = [],
attentions = [],
)
pass
class FastGemma2Model(FastLlamaModel):
@staticmethod
def pre_patch():
init_name, function = patch_linear_scaling(
model_name = "gemma2",
rope_module = GemmaFixedRotaryEmbedding,
scaled_rope_module = GemmaFixedLinearScalingRotaryEmbedding,
attention_module = Gemma2Attention,
)
if init_name is not None:
exec(function, globals())
Gemma2Attention.__init__ = eval(init_name)
pass
Gemma2Attention .forward = Gemma2Attention_fast_forward
Gemma2SdpaAttention .forward = Gemma2Attention_fast_forward
Gemma2FlashAttention2.forward = Gemma2Attention_fast_forward
Gemma2DecoderLayer .forward = Gemma2DecoderLayer_fast_forward
Gemma2Model .forward = LlamaModel_fast_forward
Gemma2ForCausalLM .forward = CausalLM_fast_forward(Gemma2Model_fast_forward_inference)
PeftModelForCausalLM .forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(Gemma2ForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.gemma2.modeling_gemma2
transformers.models.gemma2.modeling_gemma2.Gemma2RotaryEmbedding = GemmaFixedRotaryEmbedding
return
pass
@staticmethod
def post_patch(model):
# Patch model for Gemma
layers = model.model.layers
# Torch.compile fails on embedding matrix??
# Workaround randomly fixes it for torch versions < 2.2
model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
model.config.update({"unsloth_version" : __version__})
# We also do this for the lm_head
lm_head = torch.nn.Linear(1, 1, bias = None)
del lm_head.weight
lm_head.weight = model.lm_head.weight
lm_head.in_features = lm_head.weight.shape[1]
lm_head.out_features = lm_head.weight.shape[0]
model.lm_head = lm_head
# Gemma has tied weights! This means lm_head == embed_tokens
if model.model.embed_tokens.weight.data_ptr() != model.lm_head.weight.data_ptr():
lm_head = torch.nn.Linear(1, 1, bias = None)
del lm_head.weight
lm_head.weight = model.model.embed_tokens.weight
lm_head.in_features = lm_head.weight.shape[1]
lm_head.out_features = lm_head.weight.shape[0]
model.lm_head = lm_head
pass
# Also patch all dtypes - BnB seems to not allocate the correct type?
# BnB default dtype seems to be float16!
correct_dtype = lm_head.weight.dtype
for name, module in model.named_modules():
if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)):
weight = module.weight
quant_state = weight.quant_state
if type(quant_state) is list:
# BnB seems to have float16 as default!
module.weight.quant_state[2] = correct_dtype # Cast to correct dtype
else:
# https://github.com/TimDettmers/bitsandbytes/pull/763/files
quant_state.dtype = correct_dtype
pass
pass
# Downcast RoPE embedding to correct data type
# RoPE must be done in float32 for Gemma
# if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")) \
# and (module.cos_cached.dtype != correct_dtype):
# module.cos_cached = module.cos_cached.to(correct_dtype)
# module.sin_cached = module.sin_cached.to(correct_dtype)
# pass
# pass
pass
# Add 1 to weight
# return output * (1 + self.weight)
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma/modeling_gemma.py#L89
from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm
# Freeze all parameters except LoRA
# We do this first since the in-place += 1 is not allowed on tensors with requires_grad = True
for name, param in model.named_parameters():
if ".lora_A." in name or ".lora_B." in name:
param.requires_grad_(True)
else:
param.requires_grad_(False)
pass
# Patch RMS Layernorm
for name, module in model.named_modules():
if isinstance(module, Gemma2RMSNorm):
# Must be in float32
# https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L36
# module = module.to(torch.float32)
# Leave + 1 to Triton kernel itself
# module.weight += 1.0 # return output * (1 + self.weight)
if not hasattr(module, "variance_epsilon"):
module.variance_epsilon = module.eps # Gemma doesn't use variance_epsilon
pass
# Clear deleted GPU items
import gc
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
return model
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import gc
from typing import Optional, Tuple, List, Union
from ._utils import *
from ._utils import __version__
from torch.nn.functional import scaled_dot_product_attention
from transformers.models.llama.modeling_llama import (
logger,
BaseModelOutputWithPast,
CausalLMOutputWithPast,
)
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)
from ..kernels import *
from ..tokenizer_utils import *
if HAS_FLASH_ATTENTION:
from flash_attn import flash_attn_func
# Final patching code
from transformers.models.llama.modeling_llama import (
LlamaAttention,
LlamaDecoderLayer,
LlamaModel,
LlamaForCausalLM,
)
# For PyTorch 2.1.1
try:
from transformers.models.llama.modeling_llama import (
LlamaSdpaAttention,
LlamaFlashAttention2,
)
except:
LlamaSdpaAttention = LlamaAttention
LlamaFlashAttention2 = LlamaAttention
pass
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
from transformers import set_seed as transformers_set_seed
from peft import LoraConfig, TaskType, get_peft_model as _get_peft_model
from peft import PeftModelForCausalLM
from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
from peft.tuners.lora import Linear4bit as Peft_Linear4bit
from ..save import patch_saving_functions
import re, os, inspect, math, sys
def original_apply_qkv(self, X):
Q = self.q_proj(X)
K = self.k_proj(X)
V = self.v_proj(X)
return Q, K, V
pass
def original_apply_o(self, X):
O = self.o_proj(X)
return O
pass
from math import sqrt as math_sqrt
KV_CACHE_INCREMENT = 256 # KV Cache update size
torch_nn_functional_softmax = torch.nn.functional.softmax
# Fix newer HF inference code
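# During incremental decoding only the newest token (and the matching attention-mask column)
# is passed to the model, since the KV cache already covers the prefix.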
def _fast_prepare_inputs_for_generation(self, input_ids, **kwargs,):
if "past_key_values" in kwargs:
input_ids = input_ids[:,[-1]]
kwargs["attention_mask"] = kwargs["attention_mask"][:,[-1]]
if "cache_position" in kwargs:
kwargs["position_ids"] = kwargs["cache_position"]
return { "input_ids" : input_ids, **kwargs, }
pass
def fix_prepare_inputs_for_generation(module):
# Fix prepare_inputs_for_generation
if hasattr(module, "prepare_inputs_for_generation"):
module.prepare_inputs_for_generation = _fast_prepare_inputs_for_generation
pass
pass
torch_matmul = torch.matmul
def LlamaAttention_fast_forward_inference(
self,
hidden_states: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]],
position_ids,
do_prefill = False,
attention_mask = None,
):
"""
https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L406
Fast inference using KV cache.
QK^T can be computed in 4 chunks
[Q, q] @ [K, k].T where q, k are the new tokens.
[QK^T, Qk^T]
[qK^T, qk^T]
Since the attention mask wipes Qk^T, we just get
[QK^T, 0]
[qK^T, qk^T]
Since softmax is row-wise, we get
softmax([QK^T, 0])
softmax([qK^T, qk^T])
We then multiply by [V]
[v]
softmax([QK^T, 0]) [softmax(QK^T)V] *
softmax([qK^T, qk^T]) [softmax([qK^T, qk^T]) @ [V, v]]
But notice * [softmax(QK^T)V] is just the last attention.
We just need to compute the last final row.
This means we can pass in a row of Q, but we need to
remember K and V, which are called the KV cache.
"""
Xn = hidden_states
bsz, _, hd = hidden_states.size()
K1, V1 = past_key_value
dtype = Xn.dtype
n_heads = self.num_heads
n_groups = self.num_key_value_groups
n_kv_heads = self.num_key_value_heads
head_dim = self.head_dim
attention_size = n_heads*head_dim
# assert(n_kv_heads * n_groups == n_heads)
seq_len = K1.shape[-2]
kv_seq_len = seq_len + 1
# Prefill phase
# if not hasattr(self, "paged_attention"):
if do_prefill:
self.paged_attention = torch.empty((KV_CACHE_INCREMENT+seq_len+1, 2, bsz, n_kv_heads, head_dim), dtype = dtype, device = "cuda:0")
self.paged_attention_K = self.paged_attention[:,0]
self.paged_attention_V = self.paged_attention[:,1]
self.paged_attention_K[:seq_len] = K1.permute(2, 0, 1, 3)
self.paged_attention_V[:seq_len] = V1.permute(2, 0, 1, 3)
self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
# Mistral Nemo 12b has weird dimensions
if attention_size != self.hidden_size:
self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
else:
self.temp_O = self.temp_QA[1][:,:,:self.hidden_size]
pass
self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
self.scalar = 1.0 / math_sqrt(self.head_dim)
self.half_head_dim = head_dim // 2
elif kv_seq_len >= self.paged_attention.shape[0]:
self.paged_attention.resize_((self.paged_attention.shape[0]+KV_CACHE_INCREMENT, 2, bsz, n_kv_heads, head_dim))
self.paged_attention_K = self.paged_attention[:,0]
self.paged_attention_V = self.paged_attention[:,1]
self.attention.resize_((bsz, n_heads, 1, self.attention.shape[-1]+KV_CACHE_INCREMENT))
pass
Qn = fast_linear_forward(self.q_proj, Xn, out = self.temp_QA[0])
Kn = fast_linear_forward(self.k_proj, Xn, out = self.temp_KV[0])
Vn = fast_linear_forward(self.v_proj, Xn, out = self.temp_KV[1])
Qn = Qn.view(bsz, 1, n_heads, head_dim).transpose(1, 2)
Kn = Kn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
Vn = Vn.view(bsz, 1, n_kv_heads, head_dim).transpose(1, 2)
# cos, sin = self.rotary_emb(Vn, seq_len = kv_seq_len)
# Qn, Kn = inplace_rope_embedding(Qn, Kn, cos, sin, position_ids)
cos = self.rotary_emb.cos_cached[position_ids].unsqueeze(1)
sin = self.rotary_emb.sin_cached[position_ids].unsqueeze(1)
h = self.half_head_dim
RH_Q = self.RH_Q
RH_Q[:,:,:,:h] = Qn[:,:,:,h:]
RH_Q[:,:,:,h:] = Qn[:,:,:,:h]
torch.neg(RH_Q[:,:,:,:h], out = RH_Q[:,:,:,:h])
Qn *= cos
Qn.addcmul_(RH_Q, sin)
RH_K = RH_Q[:,:n_kv_heads,:,:] # torch.empty((n_kv_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
RH_K[:,:,:,:h] = Kn[:,:,:,h:]
RH_K[:,:,:,h:] = Kn[:,:,:,:h]
torch.neg(RH_K[:,:,:,:h], out = RH_K[:,:,:,:h])
Kn *= cos
Kn.addcmul_(RH_K, sin)
# New KV cache
# Kn = torch.cat([K1, Kn], dim = 2)
# Vn = torch.cat([V1, Vn], dim = 2)
self.paged_attention_K[seq_len] = Kn.permute(2, 0, 1, 3)
self.paged_attention_V[seq_len] = Vn.permute(2, 0, 1, 3)
Kn = self.paged_attention_K[:kv_seq_len].permute(1, 2, 0, 3)
Vn = self.paged_attention_V[:kv_seq_len].permute(1, 2, 0, 3)
# Handle sliding windows
sliding_window = getattr(self.config, "sliding_window", None)
if sliding_window is not None and kv_seq_len > sliding_window:
# From https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py#L193
slicing_tokens = 1 - sliding_window
Knn = Kn[:, :, slicing_tokens:, :]#.contiguous()
Vnn = Vn[:, :, slicing_tokens:, :]#.contiguous()
else:
Knn, Vnn = Kn, Vn
pass
# Grouped query attention
_, _, cached_len, _ = Knn.shape
if n_groups != 1:
Knn = Knn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Vnn = Vnn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Knn = Knn.reshape(bsz, n_heads, cached_len, head_dim)
Vnn = Vnn.reshape(bsz, n_heads, cached_len, head_dim)
pass
# else:
# Knn, Vnn = Knn, Vnn
# pass
# Attention
if bsz == 1:
Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
# It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
A = torch_matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
# if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
A = torch_matmul(A, Vnn, out = Qn)
else:
A = scaled_dot_product_attention(Qn, Knn, Vnn, attn_mask = attention_mask, is_causal = False)
pass
A = A.transpose(1, 2)
A = A.reshape(bsz, 1, attention_size)
A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
return A, (Kn, Vn)
pass
torch_nn_functional_silu = torch.nn.functional.silu
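# Fused SwiGLU inference path. Roughly equivalent to the reference
#   down_proj(silu(gate_proj(X)) * up_proj(X)),
# with the `up` buffer reused as the output storage for down_proj.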
def fast_swiglu_inference(self, X):
# gate = self.gate_proj(X)
# up = self.up_proj(X)
bsz, _, hd = X.shape
# mlp_size = self.config.intermediate_size
# temp = torch.empty((2, bsz, 1, mlp_size), dtype = X.dtype, device = "cuda:0")
gate = fast_linear_forward(self.gate_proj, X)#, out = temp[0])
up = fast_linear_forward(self. up_proj, X)#, out = temp[1])
gate = torch_nn_functional_silu(gate, inplace = True)
gate *= up
# X = self.down_proj(gate)
down = fast_linear_forward(self.down_proj, gate, out = up[:,:,:hd])
return down
pass
def fast_rms_layernorm_inference(self, X):
old_dtype = X.dtype
XX = X.to(torch.float32)
variance = XX.square().mean(-1, keepdim = True)
variance += self.variance_epsilon
XX *= variance.rsqrt_()
X = XX.to(old_dtype) # Must preserve due to residual
X *= self.weight
return X
pass
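# Gemma's RMSNorm scales by (1 + weight) rather than weight; out_weight is an optional
# reusable float32 scratch buffer so the + 1.0 does not allocate a new tensor per call.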
def fast_rms_layernorm_inference_gemma(self, X, out_weight = None):
XX = X.to(torch.float32)
variance = XX.square().mean(-1, keepdim = True)
variance += self.variance_epsilon
XX *= variance.rsqrt_()
if out_weight is None:
out_weight = self.weight + 1.0
else:
out_weight[:] = self.weight
out_weight += 1.0
pass
XX *= out_weight
return XX.to(X.dtype)
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L320
def LlamaAttention_fast_forward(
self,
hidden_states: torch.Tensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
padding_mask: Optional[torch.LongTensor] = None,
*args, **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# Clear inference
if hasattr(self, "paged_attention"):
del self.paged_attention_K
del self.paged_attention_V
del self.paged_attention
del self.temp_QA
del self.temp_KV
del self.RH_Q
del self.attention
pass
bsz, q_len, _ = hidden_states.size()
n_heads = self.num_heads
n_groups = self.num_key_value_groups
n_kv_heads = self.num_key_value_heads
head_dim = self.head_dim
assert(n_kv_heads * n_groups == n_heads)
Q, K, V = self.apply_qkv(self, hidden_states)
Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
kv_seq_len = K.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
# Extend RoPE dynamically to fit in VRAM
self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len)
if position_ids is None:
cos = self.rotary_emb.cos_cached
sin = self.rotary_emb.sin_cached
Q, K = fast_rope_embedding(Q, K, cos, sin)
else:
cos, sin = self.rotary_emb(V, seq_len = kv_seq_len)
Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
pass
if past_key_value is not None:
K = torch.cat([past_key_value[0], K], dim = 2)
V = torch.cat([past_key_value[1], V], dim = 2)
pass
past_key_value = (K, V) if use_cache else None
# Attention module
if (not HAS_FLASH_ATTENTION and attention_mask is None):
# Xformers memory efficient attention
# Also has Flash Attention v2 dispatching
Q = Q.transpose(1, 2)
K = K.transpose(1, 2)
V = V.transpose(1, 2)
# Group query attention
if n_groups != 1:
K = K .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
V = V .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
K = K.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
V = V.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
if hidden_states.requires_grad:
K = K.reshape(bsz, kv_seq_len, n_heads, head_dim)
V = V.reshape(bsz, kv_seq_len, n_heads, head_dim)
else:
Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
pass
A = xformers_attention(Q, K, V, attn_bias = causal_mask)
A = A.view(bsz, q_len, n_heads, head_dim)
elif HAS_FLASH_ATTENTION and attention_mask is None:
Q = Q.transpose(1, 2).half()
K = K.transpose(1, 2).half()
V = V.transpose(1, 2).half()
A = flash_attn_func(Q, K, V, causal = True)
else:
# Grouped query attention
if n_groups != 1:
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
K = K.reshape(bsz, n_heads, kv_seq_len, head_dim)
V = V.reshape(bsz, n_heads, kv_seq_len, head_dim)
pass
# Must be contiguous or else results are wrong!
# https://github.com/pytorch/pytorch/issues/112577
Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
# Needs (batch_size, n_heads, seq_len, head_dim)
# is_causal and attention_mask must not both be set!
A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, is_causal = False)
# Go back to (batch_size, seq_len, n_heads, head_dim)
A = A.transpose(1, 2).contiguous()
pass
attn_output = A.reshape(bsz, q_len, n_heads*head_dim)
attn_output = self.apply_o(self, attn_output)
attn_weights = None
return attn_output, attn_weights, past_key_value
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L590
def LlamaDecoderLayer_fast_forward(
self,
hidden_states: torch.Tensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
padding_mask: Optional[torch.LongTensor] = None,
*args, **kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
if use_cache and hasattr(self, "_flag_for_generation"):
residual = hidden_states
hidden_states = fast_rms_layernorm_inference(self.input_layernorm, hidden_states)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states += residual
# Fully Connected
residual = hidden_states
hidden_states = fast_rms_layernorm_inference(self.post_attention_layernorm, hidden_states)
hidden_states = fast_swiglu_inference(self.mlp, hidden_states)
hidden_states += residual
else:
residual = hidden_states
hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = fast_rms_layernorm(self.post_attention_layernorm, hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
pass
outputs = (hidden_states,)
if output_attentions: outputs += (self_attn_weights,)
if use_cache: outputs += (present_key_value,)
return outputs
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
def LlamaModel_fast_forward(
self,
input_ids: torch.LongTensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
*args, **kwargs,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
assert(output_attentions is False)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("Unsloth: You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("Unsloth: You have to specify either decoder_input_ids or decoder_inputs_embeds")
seq_length_with_past = seq_length
# Fix out of bounds tokenization
if hasattr(self, "max_seq_length"):
if seq_length > self.max_seq_length:
logger.warning_once(
f"Unsloth: Input IDs of length {seq_length} > the model's max sequence length of {self.max_seq_length}.\n"\
"We shall truncate it ourselves. It's imperative if you correct this issue first."
)
if input_ids is not None:
input_ids = input_ids[:,:self.max_seq_length]
elif inputs_embeds is not None:
inputs_embeds = inputs_embeds[:,:self.max_seq_length,:]
pass
pass
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
pass
# We already handle KV cache position_ids ourselves.
if False:#(past_key_values_length != 0):
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length,
dtype = torch.int32,
device = "cuda:0",
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
elif position_ids is not None:
position_ids = position_ids.view(-1, seq_length).to(torch.int32)#.long()
else:
position_ids = None
pass
if position_ids is not None:
if position_ids.shape[0] != batch_size:
position_ids = position_ids.repeat((batch_size, 1))
pass
# Embed positions
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
inputs_embeds = inputs_embeds.to(self.config.torch_dtype)
# Normalized from Gemma
IS_GEMMA = self.config.model_type.startswith("gemma")
IS_GEMMA2 = self.config.model_type.startswith("gemma2")
train_embed_tokens = self.embed_tokens.weight.requires_grad
if IS_GEMMA:
# Match Gemma exactly by casting to bfloat16 / float16
# inputs_embeds *= math_sqrt(self.config.hidden_size)
# Ie 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
# & 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
normalizer = torch.tensor(math_sqrt(self.config.hidden_size), dtype = inputs_embeds.dtype)
if train_embed_tokens:
# Careful we must not do an inplace op!
inputs_embeds = inputs_embeds * normalizer
else:
inputs_requires_grad = inputs_embeds.requires_grad
if not inputs_embeds.is_leaf:
inputs_embeds = inputs_embeds.detach()
inputs_requires_grad = True
elif inputs_requires_grad:
inputs_embeds.requires_grad_(False)
pass
inputs_embeds *= normalizer
# inputs_embeds *= math_sqrt(self.config.hidden_size)
if inputs_requires_grad: inputs_embeds.requires_grad_(True)
pass
pass
# Fix up attention mask by setting elements to 0
# Specifically for DPO
if self._has_no_labels and (attention_mask is not None) and (past_key_values is None) and \
(not train_embed_tokens):
# Careful: for inference the attention_mask has shape (1, kv_seq_len),
# whilst inputs_embeds has shape (1, 1, 4096)
inputs_requires_grad = inputs_embeds.requires_grad
if not inputs_embeds.is_leaf:
inputs_embeds = inputs_embeds.detach()
inputs_requires_grad = True
elif inputs_requires_grad:
inputs_embeds.requires_grad_(False)
pass
inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2)
if inputs_requires_grad: inputs_embeds.requires_grad_(True)
pass
# Ignore attention_mask
if attention_mask is None:
padding_mask = None
elif self.training:
attention_mask = None
padding_mask = None
else:
# if 0 in attention_mask:
# padding_mask = attention_mask
# else:
padding_mask = None
attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
sliding_window = getattr(self.config, "sliding_window", None),
)
pass
hidden_states = inputs_embeds
if past_key_values is None and self.training:
use_cache = False
# if use_cache:
# logger.warning_once(
# "Unsloth: `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`"
# )
# use_cache = False
pass
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
# Gradient checkpointing methods (i.e. sqrt)
if hasattr(self, "_gradient_checkpointing_boundaries"):
boundaries = self._gradient_checkpointing_boundaries
else:
boundaries = None
pass
# Check checkpointing method
gradient_checkpointing = False
offloaded_gradient_checkpointing = False
if (self.gradient_checkpointing and self.training and not use_cache):
gradient_checkpointing = True
if output_attentions is False and hasattr(self, "_offloaded_gradient_checkpointing"):
offloaded_gradient_checkpointing = True
pass
# Check for Flex Attention
# if IS_GEMMA2 and HAS_FLEX_ATTENTION:
# if not (seq_length % FLEX_ATTENTION_PADDING == 0):
# USE_FLEX_ATTENTION = True
# Gemma2 has alternating SWA and global attn
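# Build both masks once: even-indexed layers use the sliding-window (local) mask,
# odd-indexed layers use the full causal (global) mask; see the per-layer selection below.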
if IS_GEMMA2 and not hasattr(self, "SWA_mask"):
n = self.config.max_position_embeddings
# masked_fill is making stuff slower!
# self. GA_mask = create_boolean_mask(n = n, sliding_window = 0)
# self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window)
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
self.SWA_mask = AttentionMaskConverter(
is_causal = True,
sliding_window = self.config.sliding_window,
)\
.to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
.squeeze(0).squeeze(0)
self.GA_mask = AttentionMaskConverter(
is_causal = True,
)\
.to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
.squeeze(0).squeeze(0)
pass
# Go through every layer!
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states: all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
mask = causal_mask
if IS_GEMMA2: mask = self.SWA_mask if (idx % 2 == 0) else self.GA_mask
if offloaded_gradient_checkpointing:
hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
decoder_layer,
hidden_states,
mask,
attention_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
)[0]
elif gradient_checkpointing:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, past_key_value, output_attentions, padding_mask = padding_mask)
return custom_forward
pass
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
mask,
attention_mask,
position_ids,
use_reentrant = True,
preserve_rng_state = False,
)
hidden_states = layer_outputs[0]
else:
layer_outputs = decoder_layer(
hidden_states,
causal_mask=mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
padding_mask=padding_mask,
)
hidden_states = layer_outputs[0]
pass
if use_cache: next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
if output_attentions: all_self_attns += (layer_outputs[1],)
pass
# Final layernorm
if use_cache:
hidden_states = (fast_rms_layernorm_inference_gemma if IS_GEMMA else fast_rms_layernorm_inference)\
(self.norm, hidden_states)
else:
hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA)
pass
if output_hidden_states: all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
pass
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L825
def LlamaModel_fast_forward_inference(
self,
input_ids,
past_key_values,
position_ids,
attention_mask = None,
):
input_ids = input_ids[:,:self.max_seq_length]
hidden_states = self.model.embed_tokens(input_ids)
hidden_states = hidden_states.to(self.config.torch_dtype)
bsz, q_len, hd = hidden_states.shape
seq_len = past_key_values[0][0].shape[-2]
if bsz != 1:
attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
(bsz, q_len),
hidden_states,
seq_len,
sliding_window = getattr(self.config, "sliding_window", None),
)
else:
attention_mask = None
pass
next_decoder_cache = []
for idx, decoder_layer in enumerate(self.model.layers):
residual = hidden_states
hidden_states = fast_rms_layernorm_inference(decoder_layer.input_layernorm, hidden_states)
hidden_states, present_key_value = LlamaAttention_fast_forward_inference(
decoder_layer.self_attn,
hidden_states = hidden_states,
past_key_value = past_key_values[idx],
position_ids = position_ids,
attention_mask = attention_mask,
do_prefill = not hasattr(decoder_layer.self_attn, "paged_attention"),
)
hidden_states += residual
residual = hidden_states
hidden_states = fast_rms_layernorm_inference(decoder_layer.post_attention_layernorm, hidden_states)
hidden_states = fast_swiglu_inference(decoder_layer.mlp, hidden_states)
hidden_states += residual
next_decoder_cache.append(present_key_value)
pass
hidden_states = fast_rms_layernorm_inference(self.model.norm, hidden_states)
return BaseModelOutputWithPast(
last_hidden_state = hidden_states,
past_key_values = next_decoder_cache,
hidden_states = [],
attentions = [],
)
pass
def CausalLM_fast_forward(fast_forward_inference):
def _CausalLM_fast_forward(
self,
input_ids: torch.LongTensor = None,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
*args, **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
if past_key_values is not None:
outputs = fast_forward_inference(
self,
input_ids,
past_key_values,
position_ids = position_ids,
attention_mask = attention_mask,
)
else:
causal_mask = xformers.attn_bias.LowerTriangularMask()
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
self.model._has_no_labels = labels is None
outputs = self.model(
input_ids=input_ids,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pass
hidden_states = outputs[0]
bsz, q_len, hd = hidden_states.shape
lm_head = self.lm_head.weight
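# Single-token decode fast path: with bsz == 1 and q_len == 1, a matrix-vector
# product (torch.mv) suffices and avoids the full lm_head matmul.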
if bsz == 1 and q_len == 1:
logits = torch.mv(lm_head, hidden_states.ravel().to(lm_head.dtype))
logits = logits.unsqueeze(0).unsqueeze(0)
else:
logits = self.lm_head(hidden_states.to(lm_head.dtype))
pass
logits = logits.to(self.config.torch_dtype)
loss = None
logit_softcapping = getattr(self.config, "final_logit_softcapping", 0)
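# Gemma-2 style final logit softcapping: logits = cap * tanh(logits / cap),
# which bounds the logits to (-cap, cap). A value of 0 disables it.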
if labels is not None:
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
# Fixes https://github.com/unslothai/unsloth/issues/10
self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda:0")
pass
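# Shift the labels left by one by appending a column of -100 (the ignore index),
# so the logits tensor itself never needs to be sliced.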
shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]]))
loss = fast_cross_entropy_loss(
logits = shift_logits,
labels = shift_labels,
logit_softcapping = logit_softcapping,
)
elif logit_softcapping != 0:
if logits.requires_grad:
logits = (1.0 / logit_softcapping) * logits
logits = torch.tanh(logits)
logits = logit_softcapping * logits
else:
logits *= (1.0 / logit_softcapping)
torch.tanh(logits, out = logits)
logits *= logit_softcapping
pass
pass
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
pass
return _CausalLM_fast_forward
pass
def PeftModelForCausalLM_fast_forward(
self,
input_ids=None,
causal_mask=None,
attention_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
task_ids=None,
**kwargs,
):
return self.base_model(
input_ids=input_ids,
causal_mask=causal_mask,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
labels=labels,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
pass
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
class LlamaRotaryEmbedding(torch.nn.Module):
# Fixes https://github.com/huggingface/transformers/pull/28837
# https://github.com/microsoft/DeepSpeed/issues/4932
# The precision of RoPE buffers is not correct, so we cast to int64.
def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None,
config = None, # [TODO] Hack to pass in config - need to remove later
):
super().__init__()
if config is not None:
# [TODO] Hack to pass in config - need to remove later
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads))
device = "cuda"
max_position_embeddings = config.max_position_embeddings
pass
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
# Dynamic RoPE: start the cache at a maximum of 4 * 8192 tokens, then grow it iteratively as needed
self.current_rope_size = min(4 * 8192, self.max_position_embeddings)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
pass
def _set_cos_sin_cache(self, seq_len, device, dtype):
# Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
# in FP32. They are applied (multiplied) in FP32 as well.
self.current_rope_size = seq_len
inv_freq = 1.0 / (
self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim)
)
t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float()
freqs = torch.outer(t, inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
pass
def forward(self, x, position_ids=None, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.current_rope_size:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype = x.dtype),
self.sin_cached[:seq_len].to(dtype = x.dtype),
)
pass
def extend_rope_embedding(self, x, seq_len):
if seq_len <= self.current_rope_size: return
# Iteratively grow by increments of 8192
self.current_rope_size = math.ceil(seq_len / 8192) * 8192 # round up to the next multiple of 8192
self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
pass
pass
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
# Fixes https://github.com/huggingface/transformers/pull/28837
# https://github.com/microsoft/DeepSpeed/issues/4932
# The precision of RoPE buffers is not correct, so we cast to int64.
def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0,
config = None, # [TODO] Hack to pass in config - need to remove later
):
self.scaling_factor = scaling_factor
super().__init__(dim = dim, max_position_embeddings = max_position_embeddings, base = base, device = device, config = config)
pass
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.current_rope_size = seq_len
inv_freq = 1.0 / (
self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim)
)
t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float()
t = t / self.scaling_factor
freqs = torch.outer(t, inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
pass
pass
# See https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py#L736
# For Llama 3.1
class LlamaExtendedRotaryEmbedding(torch.nn.Module):
def __init__(self, dim = None, max_position_embeddings=2048, base=10000, device=None,
config = None, # [TODO] Hack to pass in config - need to remove later
):
super().__init__()
if config is not None:
# [TODO] Hack to pass in config - need to remove later
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads))
device = "cuda"
max_position_embeddings = config.max_position_embeddings
pass
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
# Dynamic RoPE: start the cache at a maximum of 4 * 8192 tokens, then grow it iteratively as needed
self.current_rope_size = min(4 * 8192, self.max_position_embeddings)
# Normal Llama-3 RoPE
inv_freq = 1.0 / (
self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device="cpu").float() / self.dim)
)
inv_freq = self.apply_scaling(inv_freq)
self.register_buffer("inv_freq", inv_freq, persistent = False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(seq_len=self.current_rope_size, device=device, dtype=torch.get_default_dtype())
pass
def _set_cos_sin_cache(self, seq_len, device, dtype):
# Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
# in FP32. They are applied (multiplied) in FP32 as well.
self.current_rope_size = seq_len
t = torch.arange(self.current_rope_size, device="cpu", dtype=torch.int64).float()
freqs = torch.outer(t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype=dtype, device=device, non_blocking=True), persistent=False)
pass
# From https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py#L41
def apply_scaling(self, freqs: torch.Tensor):
# Values obtained from grid search
scale_factor = 8
low_freq_factor = 1
high_freq_factor = 4
old_context_len = 8192 # original llama3 length
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
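# Partition the frequencies by wavelength: short wavelengths (high freq) are kept as-is,
# long wavelengths (low freq) are divided by scale_factor, and the band in between
# is linearly interpolated via `smooth`.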
new_freqs = []
for freq in freqs:
wavelen = 2 * math.pi / freq
if wavelen < high_freq_wavelen:
new_freqs.append(freq)
elif wavelen > low_freq_wavelen:
new_freqs.append(freq / scale_factor)
else:
assert low_freq_wavelen != high_freq_wavelen
smooth = (old_context_len / wavelen - low_freq_factor) / (
high_freq_factor - low_freq_factor
)
new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
pass
def forward(self, x, position_ids=None, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.current_rope_size:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype = x.dtype),
self.sin_cached[:seq_len].to(dtype = x.dtype),
)
pass
def extend_rope_embedding(self, x, seq_len):
if seq_len <= self.current_rope_size: return
# Iteratively grow by increments of 8192
self.current_rope_size = math.ceil(seq_len / 8192) * 8192 # round up to the next multiple of 8192
self._set_cos_sin_cache(self.current_rope_size, device = "cuda:0", dtype = x.dtype)
pass
pass
def _wrap_fast_inference(generate, device_type, dtype, model):
# Wraps inference with bfloat16 / float16
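# `_flag_for_generation` is set on the model and every nested `.model` so the patched
# forward paths can detect generation and take their inference fast paths; it is removed
# again once generation finishes.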
@torch.inference_mode
def _fast_generate(*args, **kwargs):
# Set a flag for generation!
internal_model = model
while hasattr(internal_model, "model"):
internal_model._flag_for_generation = True
internal_model = internal_model.model
pass
internal_model._flag_for_generation = True
# For newer HF
kwargs["cache_implementation"] = "dynamic"
# Remove token_type_ids
kwargs.pop("token_type_ids", None)
# Check pad_token
model_eos_token_id = getattr(model.config, "eos_token_id", None)
if model_eos_token_id is not None and hasattr(model_eos_token_id, "__iter__"):
model_eos_token_id = model_eos_token_id[0]
kwargs["pad_token_id"] = kwargs.pop("pad_token_id", model_eos_token_id)
# Set pad token
# old_pad_token_id = getattr(model.config, "pad_token_id", None)
# old_eos_token_id = getattr(model.config, "eos_token_id", None)
# model.config.pad_token_id = old_eos_token_id
# Autocasted
with torch.autocast(device_type = device_type, dtype = dtype):
output = generate(*args, **kwargs)
pass
# Revert
# model.config.pad_token_id = old_pad_token_id
# Unset a flag for generation!
internal_model = model
while hasattr(internal_model, "model"):
if hasattr(internal_model, "_flag_for_generation"): del internal_model._flag_for_generation
internal_model = internal_model.model
pass
if hasattr(internal_model, "_flag_for_generation"): del internal_model._flag_for_generation
return output
pass
return _fast_generate
pass
class FastLlamaModel:
@staticmethod
def pre_patch():
init_name, function = patch_llama_rope_scaling(
model_name = "llama",
rope_module = LlamaRotaryEmbedding,
scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
extended_rope_module = LlamaExtendedRotaryEmbedding,
attention_module = LlamaAttention,
)
if init_name is not None:
exec(function, globals())
LlamaAttention.__init__ = eval(init_name)
pass
LlamaAttention .forward = LlamaAttention_fast_forward
LlamaSdpaAttention .forward = LlamaAttention_fast_forward
LlamaFlashAttention2.forward = LlamaAttention_fast_forward
LlamaDecoderLayer .forward = LlamaDecoderLayer_fast_forward
LlamaModel .forward = LlamaModel_fast_forward
LlamaForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(LlamaForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = LlamaRotaryEmbedding
transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding = LlamaLinearScalingRotaryEmbedding
return
pass
@staticmethod
def from_pretrained(
model_name = "unsloth/llama-3-8b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
model_patcher = None,
tokenizer_name = None,
trust_remote_code = False,
**kwargs,
):
if trust_remote_code:
print(
"Unsloth: WARNING `trust_remote_code` is True.\n"\
"Are you certain you want to do remote code execution?"
)
pass
if token is None and "HF_TOKEN" in os.environ:
token = os.environ["HF_TOKEN"]
if token is None and "HUGGINGFACE_TOKEN" in os.environ:
token = os.environ["HUGGINGFACE_TOKEN"]
if model_patcher is None: model_patcher = FastLlamaModel
SUPPORTS_BFLOAT16 = is_bfloat16_supported()
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
statistics = \
f"==((====))== Unsloth: Fast {model_patcher.__name__[4:-5]} patching release {__version__}\n"\
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform = {platform_system}.\n"\
f"O^O/ \_/ \\ Pytorch: {torch.__version__}. CUDA = {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit = {torch.version.cuda}.\n"\
f"\ / Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
f' "-____-" Free Apache license: http://github.com/unslothai/unsloth'
print(statistics)
# Warn about fast transfers
old_hf_transfer = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0")
if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") == "1":
print("Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!")
pass
# Return old flag
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer
model_patcher.pre_patch()
get_statistics() # For debugging - we use a download counter to see if environments are not breaking
if dtype is None:
dtype = torch.float16 if not SUPPORTS_BFLOAT16 else torch.bfloat16
elif dtype == torch.bfloat16 and not SUPPORTS_BFLOAT16:
logger.warning_once("Device does not support bfloat16. Will change to float16.")
dtype = torch.float16
assert(dtype == torch.float16 or dtype == torch.bfloat16 or dtype == torch.float32)
# RoPE Scaling
model_config = AutoConfig.from_pretrained(model_name, token = token)
model_max_seq_length = model_config.max_position_embeddings
# Check if RoPE Scaling is even allowed
model_function = MODEL_FOR_CAUSAL_LM_MAPPING[model_config.__class__]
has_rope_scaling = False
try:
with open(inspect.getfile(model_function), "r") as file:
has_rope_scaling = "self.config.rope_scaling" in file.read()
except: pass
has_rope_scaling = True
# If max_seq_length is not specified, use the maximum from the config
if max_seq_length is None:
max_seq_length = model_max_seq_length
pass
if (rope_scaling is None) and (max_seq_length > model_max_seq_length):
rope_scaling = max_seq_length / model_max_seq_length
logger.warning_once(
f"Unsloth: {model_name} can only handle sequence lengths of at most "\
f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\
f"{round(rope_scaling, 3)}, it can be magically be extended to "\
f"{max_seq_length}!"
)
# Warn RoPE scaling isn't allowed
if not has_rope_scaling:
raise RuntimeError(
"However, {model_name} doesn't support RoPE Scaling!\n"\
"Please file a feature request at https://github.com/unslothai/unsloth."
)
pass
rope_scaling = {"type": "linear", "factor": rope_scaling,}
# Add to kwargs
kwargs["rope_scaling"] = rope_scaling
pass
# We currently only support NVIDIA GPUs - AMD / Intel is a work in progress!
pre_check = check_nvidia()
bnb_config = None
if load_in_4bit:
bnb_config = BitsAndBytesConfig(
load_in_4bit = True,
bnb_4bit_use_double_quant = True,
bnb_4bit_quant_type = "nf4",
bnb_4bit_compute_dtype = dtype,
)
pass
# https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12
# RoPE Scaling's max_position_embeddings must be updated
max_position_embeddings = max(max_seq_length, model_max_seq_length)
kwargs.pop("attn_implementation", None) # No need since we auto call it
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map = device_map,
torch_dtype = dtype,
quantization_config = bnb_config,
token = token,
max_position_embeddings = max_position_embeddings,
trust_remote_code = trust_remote_code,
attn_implementation = "eager",
**kwargs,
)
# Return old flag
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = old_hf_transfer
# We currently only support NVIDIA GPUs - AMD / Intel is a work in progress!
post_check = check_nvidia()
# Counteract saved tokenizers
tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
tokenizer = load_correct_tokenizer(
tokenizer_name = tokenizer_name,
model_max_length = max_position_embeddings,
padding_side = "right",
token = token,
trust_remote_code = trust_remote_code,
)
model, tokenizer = patch_tokenizer(model, tokenizer)
model = model_patcher.post_patch(model)
# Patch up QKV / O and MLP
for idx, layer in enumerate(model.model.layers):
layer.self_attn.apply_qkv = original_apply_qkv
layer.self_attn.apply_o = original_apply_o
pass
# Patch Trainer
from transformers.trainer import Trainer
try:
if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
inner_training_loop = inspect.getsource(Trainer._inner_training_loop)
Trainer._original_training_loop = inner_training_loop
else:
inner_training_loop = Trainer._original_training_loop
except:
raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
pass
if ((post_check - pre_check) >= 1).sum() > 1:
raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
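# The block below patches Trainer._inner_training_loop by editing its source text:
# it injects an Unsloth debug banner, replaces the TPU / SageMaker availability checks
# with False, raises on DeepSpeed and multi-GPU setups (unsupported), then exec()'s the
# edited source and rebinds it as `_fast_inner_training_loop`.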
import transformers.trainer
items_in_trainer = dir(transformers.trainer)
good_items = []
for item in items_in_trainer:
# TODO: Support Deepspeed
if item.startswith(("deepspeed", "xm", "met", "smp")): continue
if item in inner_training_loop: good_items.append(item)
pass
exec("from transformers.trainer import (" + ", ".join(x for x in good_items) + ")", globals())
start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
end = inner_training_loop.find("\n\n", start)
original_debug = inner_training_loop[start:end]
spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)
debug_info = """debug_info = \\
f"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = {args.world_size}\\n"\\
f" \\\\\\ /| Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,}\\n"\\
f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\
f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\
f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
logger.warning(debug_info)
import subprocess, re, gc, numpy as np
a = np.array([0,])
try:
a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
a = np.array([int(x.decode('utf-8'))/1024 for x in a])
except:
if not torch.cuda.is_available():
raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
if ((a - PRE_CHECK) >= 1).sum() > 1:
raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()"""
debug_info = debug_info.split('\n')
debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]])
inner_training_loop = inner_training_loop.replace(original_debug, debug_info)
debug_info = """n_total_devices = total_train_batch_size // \\
args.gradient_accumulation_steps // self._train_batch_size
if n_total_devices > 1:
logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!')
debug_info ="""
debug_info = debug_info.split('\n')
debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]])
inner_training_loop = inner_training_loop.replace("debug_info =", debug_info, 1)
front_spaces = re.match(r"[\t\s]{1,}", inner_training_loop).group(0)
inner_training_loop = re.sub(r"^" + front_spaces, "", inner_training_loop, flags = re.MULTILINE)
inner_training_loop = inner_training_loop.replace(
"train_dataloader = tpu_spmd_dataloader(train_dataloader)",
"raise RuntimeError('Unsloth: TPUs are not yet supported!')"
)
inner_training_loop = inner_training_loop.replace(
"self.accelerator.free_memory()",
"self.accelerator.free_memory()\n" + \
front_spaces + "if self.is_deepspeed_enabled:"\
"raise RuntimeError('Unsloth: Deepspeed is not yet supported!')\n", 1,
)
check_batches = """train_dataloader = self.get_train_dataloader()
ga = args.gradient_accumulation_steps
bsz = self._train_batch_size
total_batches = bsz * ga * args.world_size
n_total_devices = total_batches // ga // bsz
if n_total_devices > 1:
logger.warning_once('Unsloth currently does not support multi GPU setups - but we are working on it!')
divisor = n_total_devices / 1
bsz = self._train_batch_size = max(int(bsz / divisor), 1)
if total_batches // ga // bsz > 1:
divisor = n_total_devices / 1
ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)"""
check_batches = check_batches.split('\n')
check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]])
inner_training_loop = inner_training_loop.replace(
"train_dataloader = self.get_train_dataloader()",
check_batches, 1,
)
inner_training_loop = inner_training_loop.replace(
"_inner_training_loop",
"_fast_inner_training_loop", 1,
)
exec(inner_training_loop, globals())
Trainer._inner_training_loop = _fast_inner_training_loop
inner_training_loop = inner_training_loop.replace(
"is_torch_tpu_available()",
"False",
)
if "n_total_devices >" not in inner_training_loop:
raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
pass
inner_training_loop = inner_training_loop.replace(
"is_sagemaker_mp_enabled()",
"False",
)
exec(inner_training_loop, globals())
Trainer._inner_training_loop = _fast_inner_training_loop
# Save max_seq_length
model.max_seq_length = max_position_embeddings
internal_model = model
while hasattr(internal_model, "model"):
internal_model.max_seq_length = max_position_embeddings
internal_model = internal_model.model
pass
internal_model.max_seq_length = max_position_embeddings
# We check the tokenizer first for errors
if fix_tokenizer:
tokenizer = check_tokenizer(
model = model,
tokenizer = tokenizer,
model_name = model_name,
model_max_length = max_position_embeddings,
padding_side = "right",
token = token,
)
pass
patch_saving_functions(tokenizer)
# Fix up config for transformers uploading PEFT
# Not necessary anymore since we require transformers>=4.37!
if False:
name = model.config._name_or_path
if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
name = name[:len(name) - len("-bnb-4bit")]
model.config.update({"_name_or_path" : name})
pass
pass
# Log Unsloth version for future fastpaths for inference
model.config.update({"unsloth_version" : __version__})
# Add save modules
patch_saving_functions(model)
Trainer._inner_training_loop = _fast_inner_training_loop
# Save tokenizer for inference purposes
tokenizer.padding_side = "left" # Force inference
internal_model = model
while hasattr(internal_model, "model"):
internal_model._saved_temp_tokenizer = tokenizer
internal_model = internal_model.model
pass
internal_model._saved_temp_tokenizer = tokenizer
return model, tokenizer
pass
@staticmethod
def post_patch(model):
# Patch model
layers = model.model.layers
# Torch.compile fails on embedding matrix??
# Workaround randomly fixes it for torch versions < 2.
model.set_input_embeddings(torch.nn.Embedding.from_pretrained(model.get_input_embeddings().weight))
model.config.update({"unsloth_version" : __version__})
# We also do this for the lm_head
lm_head = torch.nn.Linear(1, 1, bias = None)
del lm_head.weight
lm_head.weight = model.get_output_embeddings().weight
lm_head.in_features = lm_head.weight.shape[1]
lm_head.out_features = lm_head.weight.shape[0]
model.lm_head = lm_head
# Also patch all dtypes - BnB seems to not allocate the correct type?
# BnB default dtype seems to be float16!
correct_dtype = lm_head.weight.dtype
for name, module in model.named_modules():
if isinstance(module, (Bnb_Linear4bit, Peft_Linear4bit)):
weight = module.weight
quant_state = weight.quant_state
if type(quant_state) is list:
# BnB seems to have float16 as default!
module.weight.quant_state[2] = correct_dtype # Cast to correct dtype
else:
# https://github.com/TimDettmers/bitsandbytes/pull/763/files
quant_state.dtype = correct_dtype
pass
pass
# Downcast RoPE embedding to correct data type
if (name.endswith("rotary_emb") or hasattr(module, "cos_cached")) \
and (module.cos_cached.dtype != correct_dtype):
module.cos_cached = module.cos_cached.to(correct_dtype)
module.sin_cached = module.sin_cached.to(correct_dtype)
pass
pass
pass
# Clear deleted GPU items
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
return model
pass
@staticmethod
def get_peft_model(
model,
r = 16,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 16,
lora_dropout = 0,
bias = "none",
layers_to_transform = None,
layers_pattern = None,
use_gradient_checkpointing = True,
random_state = 3407,
max_seq_length = 2048, # not used anymore
use_rslora = False,
modules_to_save = None,
init_lora_weights = True,
loftq_config = {},
temporary_location = "_unsloth_temporary_saved_buffers",
**kwargs,
):
transformers_set_seed(random_state)
if isinstance(model, PeftModelForCausalLM):
# Check if exactly the same and then pass through!
assert(hasattr(model, "peft_config"))
peft_config = model.peft_config["default"].to_dict()
check_parameters = [
"r", "lora_alpha", "lora_dropout",
"bias", "layers_to_transform", "layers_pattern",
"use_rslora", "init_lora_weights",
]
check_all = True
for param in check_parameters:
check_all = check_all and (peft_config[param] == eval(param))
pass
# Check save_modules
old_target_modules = list(peft_config["target_modules"])
modules_to_save = peft_config["modules_to_save"]
if modules_to_save is None: modules_to_save = {}
modules_to_save = list(modules_to_save)
old_target_modules += modules_to_save
# Combine all
new_target_modules = list(target_modules) + \
list(modules_to_save if modules_to_save is not None else [])
# Now check!
new_target_modules = set(new_target_modules)
check_all = check_all and (
len(set(old_target_modules) ^ new_target_modules) == 0
)
check_all = check_all and (
(loftq_config == {} or loftq_config is None) and \
(peft_config["loftq_config"] == {} or peft_config["loftq_config"] is None)
)
if check_all:
# Simply pass through!
logger.warning(
"Unsloth: Already have LoRA adapters! We shall skip this step."
)
# Offload!
# [TODO] First offload lm_head and embed_tokens to CPU (should be disk!!)
if "embed_tokens" in new_target_modules:
print("Unsloth: Casting embed_tokens to float32")
model.model.model.embed_tokens.modules_to_save.default\
.to(device = "cuda:0", dtype = torch.float32, non_blocking = True)
model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True)
# [TODO] Move old embed_tokens to CPU - should be disk!
model.model.model.embed_tokens.original_module\
.to(device = "cpu", non_blocking = True)
model.model.model.embed_tokens.original_module.requires_grad_(False)
pass
if "lm_head" in new_target_modules:
print("Unsloth: Casting lm_head to float32")
model.model.lm_head.modules_to_save.default\
.to(device = "cuda:0", dtype = torch.float32, non_blocking = True)
model.model.lm_head.modules_to_save.default.requires_grad_(True)
# [TODO] Move old lm_head to CPU - should be disk!
model.model.lm_head.original_module\
.to(device = "cpu", non_blocking = True)
model.model.lm_head.original_module.requires_grad_(False)
pass
return model
else:
raise TypeError(
"Unsloth: Your model already has LoRA adapters. Your new parameters are different."
)
pass
pass
if loftq_config is None: loftq_config = {}
signature = str(inspect.signature(LoraConfig))
SUPPORTS_LOFTQ = "loftq_config" in signature
SUPPORTS_RSLORA = "use_rslora" in signature
assert(max_seq_length <= model.max_seq_length)
if lora_dropout != 0:
logger.warning_once(
f"Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = {lora_dropout}.\n"\
f"Unsloth will patch all other layers, except LoRA matrices, causing a performance hit."
)
pass
if bias != "none":
logger.warning_once(
f"Unsloth: bias = `none` is supported for fast patching. You are using bias = {bias}.\n"\
f"Unsloth will patch all other layers, except LoRA matrices, causing a performance hit."
)
pass
if not (type(init_lora_weights) is bool or \
init_lora_weights == "gaussian" or init_lora_weights == "loftq"):
raise ValueError(
'Unsloth: `init_lora_weights` must be either [True, False, "gaussian", "loftq"].'
)
pass
if init_lora_weights == "loftq":
if not SUPPORTS_LOFTQ:
import peft
raise RuntimeError(
f"Unsloth: Your PEFT version of {peft.__version__} does not support LoftQ init.\n"\
"Please install PEFT 0.7.2 or higher.\n"\
"You can also install from source: `pip install git+https://github.com/huggingface/peft.git"
)
pass
if loftq_config == {}:
from peft import LoftQConfig
logger.warning_once(
f"Unsloth: init_lora_weights = `loftq` is set, but `loftq_config` is None.\n"\
f"We shall use `loftq_config = LoftQConfig(loftq_bits = 4, loftq_iter = 1)`."
)
loftq_config = LoftQConfig(loftq_bits = 4, loftq_iter = 1)
pass
if hasattr(model.config, "quantization_config"):
raise ValueError(
"Unsloth: You are using `loftq` init, yet `load_in_4bit = True` was set.\n"\
"Reload your model without any quantization by setting `load_in_4bit = False`."
)
pass
pass
assert(type(use_rslora) is bool)
if use_rslora:
if not SUPPORTS_RSLORA:
# We manually check for PEFT
import peft
raise RuntimeError(
f"Unsloth: Your PEFT version of {peft.__version__} does not support `use_rslora`.\n"\
"Please install PEFT 0.7.2 or higher.\n"\
"You can also install from source: `pip install git+https://github.com/huggingface/peft.git"
)
pass
pass
accepted_modules = frozenset(("q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",),)
model.config.update({"unsloth_version" : __version__})
if type(modules_to_save) is tuple:
modules_to_save = list(modules_to_save)
pass
train_lm_head = False
train_embed_tokens = False
final_modules = []
for module in target_modules:
if module == "lm_head":
# logger.warning_once(
# "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`. "\
# "Luckily, we shall do it for you!"
# )
train_lm_head = True
if modules_to_save is None: modules_to_save = ["lm_head"]
else: modules_to_save.append("lm_head")
elif module == "embed_tokens":
# logger.warning_once(
# "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`. "\
# "Luckily, we shall do it for you!"
# )
train_embed_tokens = True
if modules_to_save is None: modules_to_save = ["embed_tokens"]
else: modules_to_save.append("embed_tokens")
else:
assert(module in accepted_modules)
final_modules.append(module)
pass
# Check if we added new tokens!
if hasattr(model, "_need_to_train_embeddings"):
if not train_lm_head or not train_embed_tokens:
print(
"Unsloth: You added new tokens but did not specify if you wanted to "\
"train the lm_head and embed_tokens.\nWe must turn it on for you."
)
train_lm_head = True
train_embed_tokens = True
if modules_to_save is None: modules_to_save = ["embed_tokens"]
else: modules_to_save.append("embed_tokens")
if modules_to_save is None: modules_to_save = ["lm_head"]
else: modules_to_save.append("lm_head")
pass
pass
# Check for Llama-3
# if hasattr(model._saved_temp_tokenizer, "_using_llama3_template"):
# if not train_embed_tokens and not train_lm_head:
# raise RuntimeError("")
# First fix untrained tokens
# Wrong - can cause reserved tokens to pop out!!
# if train_embed_tokens or train_lm_head:
# fix_untrained_tokens(model, eps = 1e-16)
# pass
# Check modules_to_save
if modules_to_save is not None:
for module in modules_to_save:
if module == "lm_head":
train_lm_head = True
elif module == "embed_tokens":
train_embed_tokens = True
else:
raise TypeError(
f"Unsloth: Module = {module} is not allowed. Only 'lm_head' and 'embed_tokens' is allowed."
)
pass
pass
if isinstance(modules_to_save, (tuple, list)):
modules_to_save = list(set(modules_to_save))
pass
# Get LoRA
arguments = dict(
r = r,
lora_alpha = lora_alpha,
target_modules = final_modules,
lora_dropout = lora_dropout,
bias = bias,
task_type = TaskType.CAUSAL_LM,
layers_to_transform = layers_to_transform,
init_lora_weights = init_lora_weights,
loftq_config = loftq_config,
use_rslora = use_rslora,
modules_to_save = modules_to_save,
**kwargs,
)
if not SUPPORTS_LOFTQ: del arguments["loftq_config"]
if not SUPPORTS_RSLORA: del arguments["use_rslora"]
_saved_temp_tokenizer = model._saved_temp_tokenizer
lora_config = LoraConfig(**arguments)
# First offload lm_head and embed_tokens to disk
input_embeddings_device = model. get_input_embeddings().weight.device
output_embeddings_device = model.get_output_embeddings().weight.device
if use_gradient_checkpointing == "unsloth":
if train_embed_tokens:
print("Unsloth: Offloading input_embeddings to disk to save VRAM")
offload_input_embeddings(model, temporary_location)
pass
# Remove old items to save VRAM
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
pass
if train_lm_head:
print("Unsloth: Offloading output_embeddings to disk to save VRAM")
offload_output_embeddings(model, temporary_location)
pass
# Remove old items to save VRAM
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
pass
pass
model = _get_peft_model(model, lora_config)
model._saved_temp_tokenizer = _saved_temp_tokenizer
model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing)
# Now patch lm_head and embed_tokens
if train_embed_tokens:
print("Unsloth: Casting embed_tokens to float32")
assert(hasattr(model.model.model.embed_tokens, "modules_to_save"))
model.model.model.embed_tokens.modules_to_save.default\
.to(device = "cuda:0", dtype = torch.float32, non_blocking = True)
model.model.model.embed_tokens.modules_to_save.default.requires_grad_(True)
pass
if train_lm_head:
print("Unsloth: Casting lm_head to float32")
assert(hasattr(model.model.lm_head, "modules_to_save"))
model.model.lm_head.modules_to_save.default\
.to(device = "cuda:0", dtype = torch.float32, non_blocking = True)
model.model.lm_head.modules_to_save.default.requires_grad_(True)
pass
# Patch tokenizer to pad to the right
internal_model = model
while hasattr(internal_model, "model"):
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "right"
pass
internal_model = internal_model.model
pass
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "right"
pass
# Clear deleted GPU items
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
pass
return model
pass
@staticmethod
def patch_peft_model(
model,
use_gradient_checkpointing = True,
):
if not isinstance(model, PeftModelForCausalLM):
raise TypeError(
"Unsloth: Your model needs to call `.get_peft_model` first!"
)
pass
# Get activation function
model_type = model.config.model_type
if model_type == "llama": apply_lora_mlp = apply_lora_mlp_swiglu
elif model_type == "mistral": apply_lora_mlp = apply_lora_mlp_swiglu
elif model_type == "qwen2": apply_lora_mlp = apply_lora_mlp_swiglu
elif model_type == "gemma": apply_lora_mlp = apply_lora_mlp_geglu_approx
elif model_type == "gemma2": apply_lora_mlp = apply_lora_mlp_geglu_approx
else:
raise NotImplementedError(f"Unsloth: {model_type} is not yet implemented!")
pass
model = prepare_model_for_kbit_training(
model,
use_gradient_checkpointing = use_gradient_checkpointing,
use_reentrant = True,
)
# Fix up config for transformers uploading PEFT
for active_adapter in model.peft_config.keys():
# Not necessary since we require transformers >= 4.37
if False:
name = model.peft_config[active_adapter].base_model_name_or_path
if name.startswith("unsloth/") and name.endswith("-bnb-4bit"):
name = name[:len(name) - len("-bnb-4bit")]
model.peft_config[active_adapter].base_model_name_or_path = name
pass
# Add revision to enable future fast inference paths
model.peft_config[active_adapter].revision = f"unsloth"
pass
from transformers.trainer import Trainer
if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
raise RuntimeError(
'Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so '\
'enabling it will require much more work, so we have to prioritize. Please understand!\n'\
'We do have a separate beta version, which you can contact us about!\n'\
'Thank you for your understanding and we appreciate it immensely!'
)
pass
# Fix loftq issues
# loftq_config must not = None, but rather {}
all_configs = model.peft_config
for key, current_config in all_configs.items():
if hasattr(current_config, "loftq_config") and current_config.loftq_config is None:
new_args = current_config.__dict__
new_args["loftq_config"] = {}
current_config = current_config.__class__(**new_args)
all_configs[key] = current_config
pass
pass
# Do patching
n_mlp = 0
n_qkv = 0
n_o = 0
import types
active_adapter = model.active_adapters[0] if \
hasattr(model, "active_adapters") else model.active_adapter
# Get dropout and bias
lora_dropout = model.peft_config[active_adapter].lora_dropout
bias = model.peft_config[active_adapter].bias
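# The fused LoRA forward paths below assume lora_dropout == 0 and bias == "none";
# otherwise the layers keep PEFT's default forward (slower but equivalent).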
if lora_dropout == 0 and bias == "none":
for idx, layer in enumerate(model.model.model.layers):
# MLP patching
gate_proj = layer.mlp.gate_proj
up_proj = layer.mlp. up_proj
down_proj = layer.mlp.down_proj
if hasattr(gate_proj, "lora_A") and \
hasattr( up_proj, "lora_A") and \
hasattr(down_proj, "lora_A") and \
(getattr(gate_proj, "base_layer", gate_proj).bias is None) and \
(getattr( up_proj, "base_layer", up_proj).bias is None) and \
(getattr(down_proj, "base_layer", down_proj).bias is None) and \
(len(getattr(gate_proj, "lora_magnitude_vector", []) or []) == 0) and \
(len(getattr( up_proj, "lora_magnitude_vector", []) or []) == 0) and \
(len(getattr(down_proj, "lora_magnitude_vector", []) or []) == 0):
# https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module
layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
n_mlp += 1
else:
logger.warning_once(
"Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters\n"\
"are not enabled or a bias term (like in Qwen) is used."
)
pass
# QKV attention patching
q_proj = layer.self_attn.q_proj
k_proj = layer.self_attn.k_proj
v_proj = layer.self_attn.v_proj
if hasattr(q_proj, "lora_A") and \
hasattr(k_proj, "lora_A") and \
hasattr(v_proj, "lora_A") and \
(getattr(q_proj, "base_layer", q_proj).bias is None) and \
(getattr(k_proj, "base_layer", k_proj).bias is None) and \
(getattr(v_proj, "base_layer", v_proj).bias is None) and \
(len(getattr(q_proj, "lora_magnitude_vector", []) or []) == 0) and \
(len(getattr(k_proj, "lora_magnitude_vector", []) or []) == 0) and \
(len(getattr(v_proj, "lora_magnitude_vector", []) or []) == 0):
layer.self_attn.apply_qkv = apply_lora_qkv
n_qkv += 1
else:
if model_type != "qwen2":
logger.warning_once(
"Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters\n"\
"are not enabled or a bias term (like in Qwen) is used."
)
pass
pass
# O attention patching
o_proj = layer.self_attn.o_proj
if hasattr(o_proj, "lora_A") and \
(getattr(o_proj, "base_layer", o_proj).bias is None) and \
(len(getattr(o_proj, "lora_magnitude_vector", []) or []) == 0):
layer.self_attn.apply_o = apply_lora_o
n_o += 1
else:
logger.warning_once(
"Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters\n"\
"are not enabled or a bias term (like in Qwen) is used."
)
pass
pass
pass
logger.warning_once(
f"Unsloth {__version__} patched {len(model.model.model.layers)} layers with "\
f"{n_qkv} QKV layers, {n_o} O layers and {n_mlp} MLP layers.",
)
patch_saving_functions(model)
# Patch cross entropy loss labels
# Fixes https://github.com/unslothai/unsloth/issues/10
max_seq_length = model.max_seq_length
extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda:0")
model.model.extra_ignored_labels = extra_ignored_labels
internal_model = model
while hasattr(internal_model, "model"):
internal_model.max_seq_length = max_seq_length
internal_model = internal_model.model
pass
internal_model.max_seq_length = max_seq_length
# Patch tokenizer to pad to the right
internal_model = model
while hasattr(internal_model, "model"):
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "right"
pass
internal_model = internal_model.model
pass
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "right"
pass
# Clear deleted GPU items
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
pass
return model
pass
@staticmethod
def for_inference(model):
# if model.config.model_type == "qwen2":
# FastLlamaModel.for_training(model)
# return
# pass
internal_model = model
internal_model.gradient_checkpointing = False
internal_model.training = False
while hasattr(internal_model, "model"):
internal_model = internal_model.model
internal_model.gradient_checkpointing = False
internal_model.training = False
pass
# Also check if lm_head / embeddings are trained
internal_model = model
while not hasattr(internal_model, "lm_head"):
internal_model = internal_model.model
pass
lm_head = internal_model.lm_head.weight
device_type = lm_head.device.type
dtype = model.config.torch_dtype
if type(dtype) is str:
if dtype == "float16": dtype = torch.float16
elif dtype == "bfloat16": dtype = torch.bfloat16
pass
# Wrap model.generate
if model.generate.__name__ != "_fast_generate":
model._unwrapped_old_generate = model.generate
model.generate = _wrap_fast_inference(model.generate, device_type, dtype, model)
pass
# Patch tokenizer to pad to the left
internal_model = model
while hasattr(internal_model, "model"):
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "left"
pass
internal_model = internal_model.model
pass
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "left"
pass
pass
@staticmethod
def for_training(model, use_gradient_checkpointing = True):
internal_model = model
internal_model.gradient_checkpointing = use_gradient_checkpointing
internal_model.training = True
# Delete all fast inference loras
for param in model.parameters():
if hasattr(param, "_fast_lora"):
del param._fast_lora
pass
while hasattr(internal_model, "model"):
internal_model = internal_model.model
internal_model.gradient_checkpointing = use_gradient_checkpointing
internal_model.training = True
pass
# Also revert model.generate
if hasattr(model, "_unwrapped_old_generate"):
model.generate = model._unwrapped_old_generate
del model._unwrapped_old_generate
pass
# Patch tokenizer to pad to the right
internal_model = model
while hasattr(internal_model, "model"):
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "right"
pass
internal_model = internal_model.model
pass
if hasattr(internal_model, "_saved_temp_tokenizer"):
internal_model._saved_temp_tokenizer.padding_side = "right"
pass
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import FastLlamaModel, logger
from .mistral import FastMistralModel
from .qwen2 import FastQwen2Model
from transformers import AutoConfig
from transformers import __version__ as transformers_version
from peft import PeftConfig, PeftModel
from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER
import os
# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
from packaging.version import Version
transformers_version = Version(transformers_version)
SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
SUPPORTS_GEMMA = transformers_version >= Version("4.38")
SUPPORTS_GEMMA2 = transformers_version >= Version("4.42")
SUPPORTS_LLAMA31 = transformers_version >= Version("4.43.1")
if SUPPORTS_GEMMA:
from .gemma import FastGemmaModel
if SUPPORTS_GEMMA2:
from .gemma2 import FastGemma2Model
pass
def _get_model_name(model_name, load_in_4bit = True):
if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER:
model_name = INT_TO_FLOAT_MAPPER[model_name.lower()]
logger.warning_once(
f"Unsloth: Your transformers version of {transformers_version} does not support native "\
f"4bit loading.\nThe minimum required version is 4.37.\n"\
f'Try `pip install --upgrade "transformers>=4.37"`\n'\
f"to obtain the latest transformers build, then restart this session.\n"\
f"For now, we shall load `{model_name}` instead (still 4bit, just slower downloading)."
)
elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER:
new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()]
# logger.warning_once(
# f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\
# f"`load_in_4bit = False`. We shall load `{new_model_name}` instead."
# )
model_name = new_model_name
elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER:
new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()]
# logger.warning_once(
# f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\
# f"We shall load `{new_model_name}` for 4x faster loading."
# )
model_name = new_model_name
pass
return model_name
pass
class FastLanguageModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/llama-3-8b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
trust_remote_code = False,
use_gradient_checkpointing = "unsloth",
resize_model_vocab = None,
revision = None,
*args, **kwargs,
):
if token is None and "HF_TOKEN" in os.environ:
token = os.environ["HF_TOKEN"]
if token is None and "HUGGINGFACE_TOKEN" in os.environ:
token = os.environ["HUGGINGFACE_TOKEN"]
old_model_name = model_name
model_name = _get_model_name(model_name, load_in_4bit)
# First check if it's a normal model via AutoConfig
from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
was_disabled = are_progress_bars_disabled()
disable_progress_bars()
try:
model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision)
is_model = True
except:
is_model = False
try:
peft_config = PeftConfig .from_pretrained(model_name, token = token, revision = revision)
is_peft = True
except:
is_peft = False
# Cannot be both!
if is_model and is_peft:
raise RuntimeError(
"Unsloth: Your repo has a LoRA adapter and a base model.\n"\
"You have 2 files `config.json` and `adapter_config.json`.\n"\
"We must only allow one config file.\n"\
"Please separate the LoRA and base models to 2 repos."
)
elif not is_model and not is_peft:
raise RuntimeError(
f"Unsloth: `{model_name}` is not a base model or a PEFT model.\n"\
"We could not locate a `config.json` or `adapter_config.json` file.\n"\
"Are you certain the model name is correct? Does it actually exist?"
)
pass
# Get base model for PEFT:
if is_peft:
# Check base model again for PEFT
model_name = _get_model_name(peft_config.base_model_name_or_path, load_in_4bit)
model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision)
pass
if not was_disabled: enable_progress_bars()
model_type = model_config.model_type
if model_type == "llama":
scaling_type = None
if getattr(model_config, "rope_scaling", None) is not None:
scaling_type1 = model_config.rope_scaling.get("type", None)
scaling_type2 = model_config.rope_scaling.get("rope_type", None)
scaling_type = scaling_type1 if scaling_type1 is not None else scaling_type2
pass
if scaling_type == "llama3" and not SUPPORTS_LLAMA31:
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Llama 3.1.\n"\
f"The minimum required version is 4.43.1\n"\
f'Try `pip install --upgrade "transformers>=4.43.1"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
dispatch_model = FastLlamaModel
elif model_type == "mistral": dispatch_model = FastMistralModel
elif model_type == "gemma":
if not SUPPORTS_GEMMA:
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
f"The minimum required version is 4.38.\n"\
f'Try `pip install --upgrade "transformers>=4.38"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
dispatch_model = FastGemmaModel
elif model_type == "gemma2":
if not SUPPORTS_GEMMA2:
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
f"The minimum required version is 4.42.3.\n"\
f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
dispatch_model = FastGemma2Model
elif model_type == "qwen2":
dispatch_model = FastQwen2Model
else:
raise NotImplementedError(
f"Unsloth: {model_name} not supported yet!\n"\
"Make an issue to https://github.com/unslothai/unsloth!",
)
pass
# Check if this is a local model, since the tokenizer gets overwritten
if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \
os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \
os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")):
tokenizer_name = old_model_name
else:
tokenizer_name = None
pass
model, tokenizer = dispatch_model.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = dispatch_model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
revision = revision if not is_peft else None,
*args, **kwargs,
)
if resize_model_vocab is not None:
model.resize_token_embeddings(resize_model_vocab)
pass
# In case the model supports tagging, add the unsloth tag.
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
pass
if hasattr(tokenizer, "add_model_tags"):
tokenizer.add_model_tags(["unsloth",])
pass
if load_in_4bit:
# Fix up bitsandbytes config
quantization_config = \
{
# Sometimes torch_dtype is not a string!!
"bnb_4bit_compute_dtype" : model.config.to_dict()["torch_dtype"],
"bnb_4bit_quant_type" : "nf4",
"bnb_4bit_use_double_quant" : True,
"llm_int8_enable_fp32_cpu_offload" : False,
"llm_int8_has_fp16_weight" : False,
"llm_int8_skip_modules" : None,
"llm_int8_threshold" : 6.0,
"load_in_4bit" : True,
"load_in_8bit" : False,
"quant_method" : "bitsandbytes",
}
model.config.update({"quantization_config" : quantization_config})
pass
if is_peft:
# From https://github.com/huggingface/peft/issues/184
# Now add PEFT adapters
model.enable_input_require_grads()
model = PeftModel.from_pretrained(
model,
old_model_name,
token = token,
revision = revision,
is_trainable = True,
)
# Patch it as well!
model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
pass
return model, tokenizer
pass
pass
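# A minimal usage sketch (illustrative; assumes the package is importable as
# `unsloth` and a CUDA GPU is available - adjust model_name / max_seq_length
# for your setup):
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name     = "unsloth/llama-3-8b-bnb-4bit",
#       max_seq_length = 2048,
#       load_in_4bit   = True,
#   )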
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
"INT_TO_FLOAT_MAPPER",
"FLOAT_TO_INT_MAPPER",
]
__INT_TO_FLOAT_MAPPER = \
{
"unsloth/mistral-7b-bnb-4bit" : (
"unsloth/mistral-7b",
"mistralai/Mistral-7B-v0.1",
),
"unsloth/llama-2-7b-bnb-4bit" : (
"unsloth/llama-2-7b",
"meta-llama/Llama-2-7b-hf",
),
"unsloth/llama-2-13b-bnb-4bit" : (
"unsloth/llama-2-13b",
"meta-llama/Llama-2-13b-hf",
),
"unsloth/codellama-34b-bnb-4bit" : (
"codellama/CodeLlama-34b-hf",
),
"unsloth/zephyr-sft-bnb-4bit" : (
"unsloth/zephyr-sft",
"HuggingFaceH4/mistral-7b-sft-beta",
),
"unsloth/tinyllama-bnb-4bit" : (
"unsloth/tinyllama",
"TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
),
"unsloth/tinyllama-chat-bnb-4bit" : (
"unsloth/tinyllama-chat",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
),
"unsloth/mistral-7b-instruct-v0.1-bnb-4bit" : (
"unsloth/mistral-7b-instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.1",
),
"unsloth/mistral-7b-instruct-v0.2-bnb-4bit" : (
"unsloth/mistral-7b-instruct-v0.2",
"mistralai/Mistral-7B-Instruct-v0.2",
),
"unsloth/llama-2-7b-chat-bnb-4bit" : (
"unsloth/llama-2-7b-chat",
"meta-llama/Llama-2-7b-chat-hf",
),
"unsloth/llama-2-7b-chat-bnb-4bit" : (
"unsloth/llama-2-7b-chat",
"meta-llama/Llama-2-7b-chat-hf",
),
"unsloth/codellama-7b-bnb-4bit" : (
"unsloth/codellama-7b",
"codellama/CodeLlama-7b-hf",
),
"unsloth/codellama-13b-bnb-4bit" : (
"codellama/CodeLlama-13b-hf",
),
"unsloth/yi-6b-bnb-4bit" : (
"unsloth/yi-6b",
"01-ai/Yi-6B",
),
"unsloth/solar-10.7b-bnb-4bit" : (
"upstage/SOLAR-10.7B-v1.0",
),
"unsloth/gemma-7b-bnb-4bit" : (
"unsloth/gemma-7b",
"google/gemma-7b",
),
"unsloth/gemma-2b-bnb-4bit" : (
"unsloth/gemma-2b",
"google/gemma-2b",
),
"unsloth/gemma-7b-it-bnb-4bit" : (
"unsloth/gemma-7b-it",
"google/gemma-7b-it",
),
"unsloth/gemma-2b-bnb-4bit" : (
"unsloth/gemma-2b-it",
"google/gemma-2b-it",
),
"unsloth/mistral-7b-v0.2-bnb-4bit" : (
"unsloth/mistral-7b-v0.2",
"alpindale/Mistral-7B-v0.2-hf",
),
"unsloth/gemma-1.1-2b-it-bnb-4bit" : (
"unsloth/gemma-1.1-2b-it",
"google/gemma-1.1-2b-it",
),
"unsloth/gemma-1.1-7b-it-bnb-4bit" : (
"unsloth/gemma-1.1-7b-it",
"google/gemma-1.1-7b-it",
),
"unsloth/Starling-LM-7B-beta-bnb-4bit" : (
"unsloth/Starling-LM-7B-beta",
"Nexusflow/Starling-LM-7B-beta",
),
"unsloth/Hermes-2-Pro-Mistral-7B-bnb-4bit" : (
"unsloth/Hermes-2-Pro-Mistral-7B",
"NousResearch/Hermes-2-Pro-Mistral-7B",
),
"unsloth/OpenHermes-2.5-Mistral-7B-bnb-4bit" : (
"unsloth/OpenHermes-2.5-Mistral-7B",
"teknium/OpenHermes-2.5-Mistral-7B",
),
"unsloth/codegemma-2b-bnb-4bit" : (
"unsloth/codegemma-2b",
"google/codegemma-2b",
),
"unsloth/codegemma-7b-bnb-4bit" : (
"unsloth/codegemma-7b",
"google/codegemma-7b",
),
"unsloth/codegemma-7b-it-bnb-4bit" : (
"unsloth/codegemma-7b-it",
"google/codegemma-7b-it",
),
"unsloth/llama-3-8b-bnb-4bit" : (
"unsloth/llama-3-8b",
"meta-llama/Meta-Llama-3-8B",
),
"unsloth/llama-3-8b-Instruct-bnb-4bit" : (
"unsloth/llama-3-8b-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct",
),
"unsloth/llama-3-70b-bnb-4bit" : (
"meta-llama/Meta-Llama-3-70B",
),
"unsloth/llama-3-70b-Instruct-bnb-4bit" : (
"meta-llama/Meta-Llama-3-70B-Instruct",
),
"unsloth/Phi-3-mini-4k-instruct-bnb-4bit" : (
"unsloth/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-mini-4k-instruct",
),
"unsloth/mistral-7b-v0.3-bnb-4bit" : (
"unsloth/mistral-7b-v0.3",
"mistralai/Mistral-7B-v0.3",
),
"unsloth/mistral-7b-instruct-v0.3-bnb-4bit" : (
"unsloth/mistral-7b-instruct-v0.3",
"mistralai/Mistral-7B-Instruct-v0.3",
),
"unsloth/Phi-3-medium-4k-instruct-bnb-4bit" : (
"unsloth/Phi-3-medium-4k-instruct",
"microsoft/Phi-3-medium-4k-instruct",
),
"unsloth/Qwen2-0.5B-bnb-4bit" : (
"unsloth/Qwen2-0.5B",
"Qwen/Qwen2-0.5B",
),
"unsloth/Qwen2-0.5B-Instruct-bnb-4bit" : (
"unsloth/Qwen2-0.5B-Instruct",
"Qwen/Qwen2-0.5B-Instruct",
),
"unsloth/Qwen2-1.5B-bnb-4bit" : (
"unsloth/Qwen2-1.5B",
"Qwen/Qwen2-1.5B",
),
"unsloth/Qwen2-1.5B-Instruct-bnb-4bit" : (
"unsloth/Qwen2-1.5B-Instruct",
"Qwen/Qwen2-1.5B-Instruct",
),
"unsloth/Qwen2-7B-bnb-4bit" : (
"unsloth/Qwen2-7B",
"Qwen/Qwen2-7B",
),
"unsloth/Qwen2-7B-Instruct-bnb-4bit" : (
"unsloth/Qwen2-7B-Instruct",
"Qwen/Qwen2-7B-Instruct",
),
"unsloth/Qwen2-70B-bnb-4bit" : (
"Qwen/Qwen2-70B",
),
"unsloth/Qwen2-70B-Instruct-bnb-4bit" : (
"Qwen/Qwen2-70B-Instruct",
),
"mistralai/Codestral-22B-v0.1" : (
"mistral-community/Codestral-22B-v0.1",
),
"unsloth/gemma-2-9b-bnb-4bit" : (
"unsloth/gemma-2-9b",
"google/gemma-2-9b",
),
"unsloth/gemma-2-27b-bnb-4bit" : (
"unsloth/gemma-2-27b",
"google/gemma-2-27b",
),
"unsloth/gemma-2-9b-it-bnb-4bit" : (
"unsloth/gemma-2-9b-it",
"google/gemma-2-9b-it",
),
"unsloth/gemma-2-27b-it-bnb-4bit" : (
"unsloth/gemma-2-27b-it",
"google/gemma-2-27b-it",
),
"unsloth/Phi-3-mini-4k-instruct-v0-bnb-4bit" : ( # Old Phi pre July
"unsloth/Phi-3-mini-4k-instruct-v0",
),
"unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit" : ( # New 12b Mistral models
"unsloth/Mistral-Nemo-Instruct-2407",
"mistralai/Mistral-Nemo-Instruct-2407",
),
"unsloth/Mistral-Nemo-Base-2407-bnb-4bit" : ( # New 12b Mistral models
"unsloth/Mistral-Nemo-Base-2407",
"mistralai/Mistral-Nemo-Base-2407",
),
"unsloth/Meta-Llama-3.1-8B-bnb-4bit" : (
"unsloth/Meta-Llama-3.1-8B",
"meta-llama/Meta-Llama-3.1-8B",
),
"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" : (
"unsloth/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
),
"unsloth/Meta-Llama-3.1-70B-bnb-4bit" : (
"meta-llama/Meta-Llama-3.1-70B",
),
"unsloth/Meta-Llama-3.1-405B-bnb-4bit" : (
"meta-llama/Meta-Llama-3.1-405B",
),
"unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit" : (
"meta-llama/Meta-Llama-3.1-405B-Instruct",
),
"unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit" : (
"meta-llama/Meta-Llama-3.1-70B-Instruct",
),
"unsloth/Mistral-Large-Instruct-2407-bnb-4bit" : (
"mistralai/Mistral-Large-Instruct-2407",
),
}
INT_TO_FLOAT_MAPPER = {}
FLOAT_TO_INT_MAPPER = {}
for key, values in __INT_TO_FLOAT_MAPPER.items():
INT_TO_FLOAT_MAPPER[key] = values[0]
for value in values:
FLOAT_TO_INT_MAPPER[value] = key
pass
# Get lowercased
lowered_key = key.lower()
INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower()
for value in values:
FLOAT_TO_INT_MAPPER[value.lower()] = lowered_key
pass
pass
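# Illustrative lookups with the tables built above:
#   INT_TO_FLOAT_MAPPER["unsloth/llama-3-8b-bnb-4bit"] -> "unsloth/llama-3-8b"
#   FLOAT_TO_INT_MAPPER["meta-llama/Meta-Llama-3-8B"]  -> "unsloth/llama-3-8b-bnb-4bit"
# Lowercased keys and values are registered as well, so lookups are effectively
# case-insensitive on both sides.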
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import *
import os
from ._utils import __version__
from .llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
)
from transformers.models.mistral.modeling_mistral import (
MistralAttention,
MistralDecoderLayer,
MistralModel,
MistralForCausalLM,
)
# For Pytorch 2.1.1
try:
from transformers.models.mistral.modeling_mistral import (
MistralSdpaAttention,
MistralFlashAttention2,
)
except:
MistralSdpaAttention = MistralAttention
MistralFlashAttention2 = MistralAttention
pass
def MistralAttention_fast_forward(
self,
hidden_states: torch.Tensor,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
padding_mask: Optional[torch.LongTensor] = None,
*args, **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# Clear inference
if hasattr(self, "paged_attention"):
del self.paged_attention_K
del self.paged_attention_V
del self.paged_attention
del self.temp_QA
del self.temp_KV
del self.RH_Q
del self.attention
pass
bsz, q_len, _ = hidden_states.size()
n_heads = self.num_heads
n_groups = self.num_key_value_groups
n_kv_heads = self.num_key_value_heads
head_dim = self.head_dim
assert(n_kv_heads * n_groups == n_heads)
Q, K, V = self.apply_qkv(self, hidden_states)
Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
V = V.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
kv_seq_len = K.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
# Extend RoPE dynamically to fit in VRAM
self.rotary_emb.extend_rope_embedding(V, seq_len = kv_seq_len)
if position_ids is None:
cos = self.rotary_emb.cos_cached
sin = self.rotary_emb.sin_cached
Q, K = fast_rope_embedding(Q, K, cos, sin)
else:
cos, sin = self.rotary_emb(V, seq_len = kv_seq_len)
Q, K = inplace_rope_embedding(Q, K, cos, sin, position_ids)
pass
if past_key_value is not None:
K = torch.cat([past_key_value[0], K], dim = 2)
V = torch.cat([past_key_value[1], V], dim = 2)
pass
past_key_value = (K, V) if use_cache else None
# Attention module
if (not HAS_FLASH_ATTENTION and attention_mask is None):
# Xformers memory efficient attention
Q = Q.transpose(1, 2)
K = K.transpose(1, 2)
V = V.transpose(1, 2)
K_M = V_M = bsz * kv_seq_len
Q_M = bsz * q_len
has_swa = isinstance(causal_mask, xformers.attn_bias.BlockDiagonalCausalMask)
# Group query attention
K = K .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
V = V .view(bsz, kv_seq_len, n_kv_heads, 1, head_dim)
K = K.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
V = V.expand(bsz, kv_seq_len, n_kv_heads, n_groups, head_dim)
if hidden_states.requires_grad:
K = K.reshape(bsz, kv_seq_len, n_heads, head_dim)
V = V.reshape(bsz, kv_seq_len, n_heads, head_dim)
if has_swa:
Q = Q.view(1, Q_M, n_heads, head_dim)
K = K.view(1, K_M, n_heads, head_dim)
V = V.view(1, V_M, n_heads, head_dim)
pass
else:
# Xformers does support the expanded 5D (grouped) layout for the forward pass (inference) though
Q = Q.view(bsz, q_len, n_kv_heads, n_groups, head_dim)
if has_swa:
Q = Q.view(1, Q_M, n_kv_heads, n_groups, head_dim)
K = K.view(1, K_M, n_kv_heads, n_groups, head_dim)
V = V.view(1, V_M, n_kv_heads, n_groups, head_dim)
pass
pass
A = xformers_attention(Q, K, V, attn_bias = causal_mask)
A = A.view(bsz, q_len, n_heads, head_dim)
elif HAS_FLASH_ATTENTION and attention_mask is None:
Q = Q.transpose(1, 2)
K = K.transpose(1, 2)
V = V.transpose(1, 2)
sw = getattr(self.config, "sliding_window", None)
sw = kv_seq_len if (sw is None or sw == "null") else sw
window = (-1, -1) if (kv_seq_len <= sw) else (sw, sw)
A = flash_attn_func(Q, K, V, causal = True, window_size = window)
else:
# Grouped query attention
# if n_groups != 1:
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, kv_seq_len, head_dim)
K = K.reshape(bsz, n_heads, kv_seq_len, head_dim)
V = V.reshape(bsz, n_heads, kv_seq_len, head_dim)
# pass
# Must be contiguous or else results are incorrect!
# https://github.com/pytorch/pytorch/issues/112577
Q, K, V = Q.contiguous(), K.contiguous(), V.contiguous()
# Needs (batch_size, n_heads, seq_len, head_dim)
# is_causal and attention_mask must not both be set!
A = scaled_dot_product_attention(Q, K, V, attn_mask = attention_mask, is_causal = False)
# Go back to (batch_size, seq_len, n_heads, head_dim)
A = A.transpose(1, 2).contiguous()
pass
attn_output = A.reshape(bsz, q_len, n_heads*head_dim)
attn_output = self.apply_o(self, attn_output)
attn_weights = None
return attn_output, attn_weights, past_key_value
pass
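# Descriptive note on the three attention paths above:
#   1. No flash-attn and no attention_mask -> xformers memory-efficient attention,
#      with K/V expanded from n_kv_heads to n_heads for grouped query attention.
#   2. Flash-attn available, no mask       -> flash_attn_func, using a sliding window
#      whenever kv_seq_len exceeds config.sliding_window.
#   3. Otherwise                           -> torch scaled_dot_product_attention with the
#      explicit attention_mask (is_causal = False).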
def MistralForCausalLM_fast_forward(
self,
input_ids: torch.LongTensor = None,
causal_mask: Optional[xformers.attn_bias.BlockDiagonalCausalMask] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
*args, **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
if causal_mask is None and past_key_values is None:
bsz, q_len = input_ids.shape
sliding_window = getattr(self.config, "sliding_window", None)
if sliding_window is None or sliding_window == "null" or sliding_window <= 0:
causal_mask = xformers.attn_bias.LowerTriangularMask()
elif q_len <= sliding_window:
causal_mask = xformers.attn_bias.LowerTriangularMask()
else:
# Fix from https://github.com/Rypo
causal_mask = xformers.attn_bias.BlockDiagonalCausalMask\
.from_seqlens([q_len]*bsz)\
.make_local_attention(window_size = sliding_window)
pass
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
self.model._has_no_labels = labels is None
if past_key_values is not None:
outputs = LlamaModel_fast_forward_inference(
self,
input_ids,
past_key_values,
position_ids = position_ids,
attention_mask = attention_mask,
)
else:
outputs = self.model(
input_ids=input_ids,
causal_mask=causal_mask,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pass
hidden_states = outputs[0]
bsz, q_len, hd = hidden_states.shape
lm_head = self.lm_head.weight
if bsz == 1 and q_len == 1:
logits = torch.mv(lm_head, hidden_states.ravel().to(lm_head.dtype))
logits = logits.unsqueeze(0).unsqueeze(0)
else:
logits = self.lm_head(hidden_states.to(lm_head.dtype))
pass
logits = logits.to(self.config.torch_dtype)
loss = None
if labels is not None:
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
# Fixes https://github.com/unslothai/unsloth/issues/10
self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda:0")
pass
shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]]))
loss = fast_cross_entropy_loss(
logits = shift_logits,
labels = shift_labels,
)
pass
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
pass
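# Descriptive note: when a KV cache is passed (past_key_values is not None), the
# CUDA-graphable LlamaModel_fast_forward_inference path is used. For single-token
# decoding (bsz == 1, q_len == 1) the logits are computed with a matrix-vector
# product against lm_head instead of a full nn.Linear call. The loss path shifts
# labels left by one and appends a column of -100 (ignored) via
# `extra_ignored_labels` before calling fast_cross_entropy_loss.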
# Transformers had to update for Mistral Nemo 12b, since num_heads * head_dim (4096) no longer equals hidden_size (5120).
def patch_mistral_nemo_attention(function):
function = function.replace(
"(self.head_dim * self.num_heads) != self.hidden_size",
"False",
)
function = function.replace(
"self.head_dim = self.hidden_size // self.num_heads",
"self.head_dim = config.head_dim",
)
function = function.replace(
"self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)",
"self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)",
)
return function
pass
class FastMistralModel(FastLlamaModel):
@staticmethod
def pre_patch():
init_name, function = patch_linear_scaling(
model_name = "mistral",
rope_module = LlamaRotaryEmbedding,
scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
attention_module = MistralAttention,
)
# Just for Mistral Nemo models!
if function is not None:
function = patch_mistral_nemo_attention(function)
# if True:#init_name is not None:
exec(function, globals())
MistralAttention.__init__ = eval(init_name)
pass
MistralAttention .forward = MistralAttention_fast_forward
MistralSdpaAttention .forward = MistralAttention_fast_forward
MistralFlashAttention2.forward = MistralAttention_fast_forward
MistralDecoderLayer .forward = LlamaDecoderLayer_fast_forward
MistralModel .forward = LlamaModel_fast_forward
MistralForCausalLM .forward = MistralForCausalLM_fast_forward
PeftModelForCausalLM .forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(MistralForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.mistral.modeling_mistral
transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding = LlamaRotaryEmbedding
return
pass
@staticmethod
def from_pretrained(
model_name = "unsloth/mistral-7b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Mistral does not support RoPE scaling
fix_tokenizer = True,
model_patcher = None,
tokenizer_name = None,
trust_remote_code = False,
**kwargs,
):
return FastLlamaModel.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = FastMistralModel,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
**kwargs,
)
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .llama import *
from .llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
)
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2Attention,
Qwen2DecoderLayer,
Qwen2Model,
Qwen2ForCausalLM,
)
# For Pytorch 2.1.1
try:
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2SdpaAttention,
Qwen2FlashAttention2,
)
except:
Qwen2SdpaAttention = Qwen2Attention
Qwen2FlashAttention2 = Qwen2Attention
pass
class FastQwen2Model(FastLlamaModel):
@staticmethod
def pre_patch():
init_name, function = patch_linear_scaling(
model_name = "qwen2",
rope_module = LlamaRotaryEmbedding,
scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
attention_module = Qwen2Attention,
)
if init_name is not None:
exec(function, globals())
Qwen2Attention.__init__ = eval(init_name)
pass
Qwen2Attention .forward = LlamaAttention_fast_forward
Qwen2SdpaAttention .forward = LlamaAttention_fast_forward
Qwen2FlashAttention2.forward = LlamaAttention_fast_forward
Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward
Qwen2Model .forward = LlamaModel_fast_forward
Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward
fix_prepare_inputs_for_generation(Qwen2ForCausalLM)
# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.qwen2.modeling_qwen2
transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding
return
pass
@staticmethod
def from_pretrained(
model_name = "Qwen/Qwen2-7B",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Qwen2 does not support RoPE scaling
fix_tokenizer = True,
model_patcher = None,
tokenizer_name = None,
trust_remote_code = False,
**kwargs,
):
return FastLlamaModel.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = FastQwen2Model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
**kwargs,
)
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from bitsandbytes.nn import Linear4bit as Bnb_Linear4bit
from peft.tuners.lora import Linear4bit as Peft_Linear4bit
from peft.tuners.lora import Linear as Peft_Linear
from typing import Optional, Callable, Union, List
import torch
import os
import shutil
import pickle
import gc
from transformers.models.llama.modeling_llama import logger
from .kernels import fast_dequantize, QUANT_STATE, get_lora_parameters_bias
import subprocess
import psutil
import re
from transformers.models.llama.modeling_llama import logger
from .tokenizer_utils import fix_sentencepiece_gguf
__all__ = [
"print_quantization_methods",
"unsloth_save_model",
"save_to_gguf",
"patch_saving_functions",
]
# Check environments
keynames = "\n" + "\n".join(os.environ.keys())
IS_COLAB_ENVIRONMENT = "\nCOLAB_" in keynames
IS_KAGGLE_ENVIRONMENT = "\nKAGGLE_" in keynames
del keynames
# Weights
LLAMA_WEIGHTS = (
"self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj",
"mlp.gate_proj", "mlp.up_proj", "mlp.down_proj",
)
LLAMA_LAYERNORMS = (
"input_layernorm", "post_attention_layernorm",
"pre_feedforward_layernorm", "post_feedforward_layernorm",
)
# https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
# From https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html
ALLOWED_QUANTS = \
{
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"bf16" : "Bfloat16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"f16" : "Float16 - Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q4_k" : "alias for q4_k_m",
"q5_k" : "alias for q5_k_m",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
# "iq2_xxs" : "2.06 bpw quantization", # Not supported sadly
# "iq2_xs" : "2.31 bpw quantization",
# "iq3_xxs" : "3.06 bpw quantization",
"q3_k_xs" : "3-bit extra small quantization",
}
def print_quantization_methods():
for key, value in ALLOWED_QUANTS.items():
print(f'"{key}" ==> {value}')
pass
pass
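# Illustrative output of print_quantization_methods() (two of the entries above):
#   "q4_k_m" ==> Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K
#   "q8_0" ==> Fast conversion. High resource use, but generally acceptable.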
def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"):
if not hasattr(model, "_saved_temp_tokenizer"): return False
temp_tokenizer = model._saved_temp_tokenizer
sentencepiece_model = False
file_location = os.path.join(temporary_location, temp_tokenizer.name_or_path)
if not os.path.exists(file_location):
os.makedirs(file_location)
pass
temp_tokenizer.save_pretrained(file_location)
if os.path.isfile(f"{file_location}/tokenizer.model"):
sentencepiece_model = True
pass
shutil.rmtree(file_location, ignore_errors = True)
return sentencepiece_model
pass
def _free_cached_model(model):
from huggingface_hub import scan_cache_dir
cached_repos = list(scan_cache_dir().repos)
# Go through every cached repo, and delete the one that matches the model we want to save.
# Can save 4GB of disk space - useful for Kaggle systems.
for cached_repo in cached_repos:
if cached_repo.repo_id == model.config._name_or_path:
remove_cache_commit = list(cached_repo.revisions)[0].commit_hash
delete_strategy = scan_cache_dir().delete_revisions(remove_cache_commit,)
logger.warning_once(
"Unsloth: Will remove a cached repo with size " + \
delete_strategy.expected_freed_size_str,
)
delete_strategy.execute()
pass
pass
pass
def _merge_lora(layer, name):
bias = None
if isinstance(layer, (Bnb_Linear4bit, Peft_Linear4bit, Peft_Linear)):
# Is LoRA so we need to merge!
W, quant_state, A, B, s, bias = get_lora_parameters_bias(layer)
if quant_state is not None:
dtype = quant_state.dtype if type(quant_state) is not list else quant_state[2]
W = fast_dequantize(W, quant_state)
else:
dtype = W.dtype
W = W.to(torch.float32).t()
# W = W.t()
if A is not None:
# sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32)))
# W += sAB
W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s)
# W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s)
# if not torch.isfinite(W).all():
maximum_element = torch.max(W.min().abs(), W.max())
if not torch.isfinite(maximum_element).item():
raise ValueError(f"Unsloth: Merge failed.\n{name} has some elements = infinity.")
pass
W = W.t().to(dtype)
else:
W = layer.weight
return W, bias
pass
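# Descriptive note on the merge math above: for a LoRA layer with base weight
# W (out, in), adapters A (r, in), B (out, r) and scale s, the merged weight is
# W + s * (B @ A). Since W is dequantized to float32 and transposed to (in, out)
# first, the update is applied as W_t.addmm_(A.t(), B.t(), alpha = s), which is
# the transpose of s * B @ A, then transposed back and cast to the original dtype.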
def fast_save_pickle(shard, name):
# Use this if # CPUs is <= 2
print(f"Unsloth: Saving {name}...")
torch.save(
shard,
name,
# HIGHEST_PROTOCOL seems to not work with Pytorch!
# pickle_module = pickle,
# pickle_protocol = pickle.HIGHEST_PROTOCOL,
)
return
pass
@torch.inference_mode
def unsloth_save_model(
model,
tokenizer,
save_directory : Union[str, os.PathLike],
save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
# Push to hub
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
create_pr : bool = False,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : List[str] = None,
# Our functions
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.9,
):
if token is None and "HF_TOKEN" in os.environ:
token = os.environ["HF_TOKEN"]
if token is None and "HUGGINGFACE_TOKEN" in os.environ:
token = os.environ["HUGGINGFACE_TOKEN"]
if commit_message is None: commit_message = ""
if "Unsloth" not in commit_message:
commit_message += " (Trained with Unsloth)"
commit_message = commit_message.lstrip()
if commit_description is None:
commit_description = "Upload model trained with Unsloth 2x faster"
elif "Unsloth 2x faster" not in commit_description:
commit_description += " (Trained with Unsloth 2x faster)"
pass
if save_method == "merged_4bit":
raise RuntimeError(
"Unsloth: Merging into 4bit will cause your model to lose accuracy if you plan\n"\
"to merge to GGUF or others later on. I suggest you to do this as a final step\n"\
"if you're planning to do multiple saves.\n"\
"If you are certain, change `save_method` to `merged_4bit_forced`."
)
elif save_method == "merged_4bit_forced":
save_method = "merged_4bit"
pass
save_pretrained_settings = dict(locals())
for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
del save_pretrained_settings[deletion]
pass
# First check for a token!
if push_to_hub:
from huggingface_hub import whoami
try:
username = whoami(token = token)["name"]
except:
raise RuntimeError(
"Unsloth: Please supply a token!\n"\
"Go to https://huggingface.co/settings/tokens"
)
pass
pass
assert(maximum_memory_usage > 0 and maximum_memory_usage <= 0.95)
# Clean memory up first
for _ in range(3):
torch.cuda.empty_cache()
gc.collect()
pass
save_method = save_method.lower().replace(" ", "_")
if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
raise RuntimeError(
"Unsloth: You must select one of 3 options when saving models:\n"\
'"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
'"merged_16bit" ==> This merges LoRA weights and saves to float16. Needed for llama.cpp / GGUF.\n'\
'"merged_4bit" ==> This merges LoRA weights and saves to 4bit. Useful for DPO / inference.'
)
pass
if save_method == "merged_4bit":
print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
print("This might take 5 minutes...")
# Counteract no LoRA adapters!
if hasattr(model, "merge_and_unload"):
model = model.merge_and_unload()
pass
print("Done.")
pass
if tags is not None:
assert(isinstance(tags, (list, tuple)))
tags = list(tags) + ["unsloth",]
else:
tags = ["unsloth",]
pass
save_pretrained_settings["tags"] = tags
if ((save_method == "lora") or (save_method == "merged_4bit")) and push_to_hub:
if token is None:
raise RuntimeError(
"Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
"Go to https://huggingface.co/settings/tokens."
)
pass
if save_method == "lora":
print("Unsloth: Saving LoRA adapters. Please wait...")
elif save_method == "merged_4bit":
print("Unsloth: Saving 4bit Bitsandbytes model. Please wait...")
pass
# Update model tag
_ = upload_to_huggingface(
model, save_directory, token,
"finetuned", "trl", file_location = None,
old_username = None, private = private,
)
getattr(model, "original_push_to_hub", tokenizer.push_to_hub)\
(
repo_id = save_directory,
use_temp_dir = use_temp_dir,
commit_message = commit_message,
private = private,
token = token,
max_shard_size = max_shard_size,
create_pr = create_pr,
safe_serialization = safe_serialization,
revision = revision,
commit_description = commit_description,
tags = tags,
)
if tokenizer is not None:
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
getattr(tokenizer, "original_push_to_hub", tokenizer.push_to_hub)\
(
repo_id = save_directory,
use_temp_dir = use_temp_dir,
commit_message = commit_message,
private = private,
token = token,
max_shard_size = max_shard_size,
create_pr = create_pr,
safe_serialization = safe_serialization,
revision = revision,
commit_description = commit_description,
tags = tags,
)
# Revert back padding side
tokenizer.padding_side = old_padding_side
pass
if hasattr(model, "config"):
print(f"Saved {save_method} model to https://huggingface.co/" + save_directory)
pass
return save_directory, None
pass
# Tokenizer has different saving arguments
tokenizer_save_settings = \
{
"save_directory" : save_pretrained_settings["save_directory"],
"legacy_format" : None,
"filename_prefix" : None,
"push_to_hub" : save_pretrained_settings["push_to_hub"],
"private" : save_pretrained_settings["private"],
"token" : save_pretrained_settings["token"],
}
# Check if it is a PEFT model or not - if yes, 3 levels of nesting; if not, 2 levels.
from peft import PeftModelForCausalLM
if isinstance(model, PeftModelForCausalLM):
internal_model = model.model
else:
internal_model = model
pass
# Cannot be converted properly!
if (save_method == "merged_4bit") or (save_method == "lora") or (
not hasattr(model, "model") or \
not hasattr(internal_model.model, "layers")
):
# Do general saving
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if save_pretrained_settings["push_to_hub"] is False else \
("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
for deletion in what_to_delete:
del save_pretrained_settings[deletion]
pass
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
# Update model tag
if push_to_hub:
_ = upload_to_huggingface(
model, save_pretrained_settings["save_directory"], token,
"finetuned", "trl", file_location = None,
old_username = None, private = private,
)
pass
if tokenizer is not None:
print("Unsloth: Saving tokenizer...", end = "")
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
tokenizer.save_pretrained(**tokenizer_save_settings)
# Revert back padding side
tokenizer.padding_side = old_padding_side
print(" Done.")
else:
print()
print("Unsloth: Saving model...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
# [TODO] Is this correct?
if save_method == "lora":
save_pretrained_settings["selected_adapters"] = None
pass
model.save_pretrained(**save_pretrained_settings)
if push_to_hub and hasattr(model, "config"):
print("Saved to https://huggingface.co/" + save_pretrained_settings["save_directory"])
pass
print(" Done.")
return save_directory, None
pass
# If push_to_hub, we must remove the .../ part of a repo
username = None
if push_to_hub and "/" in save_directory:
# +1 solves absolute path issues
username = save_directory[:save_directory.find("/")]
new_save_directory = save_directory[save_directory.find("/")+1:]
logger.warning_once(
f"Unsloth: You are pushing to hub, but you passed your HF username = {username}.\n"\
f"We shall truncate {save_directory} to {new_save_directory}"
)
save_pretrained_settings["save_directory"] = new_save_directory
tokenizer_save_settings ["save_directory"] = new_save_directory
save_directory = new_save_directory
pass
print("Unsloth: Merging 4bit and LoRA weights to 16bit...")
# Determine max RAM usage minus sharding
max_ram = psutil.virtual_memory().available
sharded_ram_usage = 5 * 1024 * 1024 * 1024
if type(max_shard_size) is str:
gb_found = re.match("([0-9]{1,})[\s]{0,}GB", max_shard_size, flags = re.IGNORECASE)
mb_found = re.match("([0-9]{1,})[\s]{0,}MB", max_shard_size, flags = re.IGNORECASE)
if gb_found: sharded_ram_usage = int(gb_found.group(1)) * 1024 * 1024 * 1024
elif mb_found: sharded_ram_usage = int(mb_found.group(1)) * 1024 * 1024
elif type(max_shard_size) is int:
# max_shard_size was given directly in bytes
sharded_ram_usage = max_shard_size
pass
# Switch to our fast saving modules if it's a slow PC!
n_cpus = psutil.cpu_count(logical = False)
if n_cpus is None: n_cpus = psutil.cpu_count()
if n_cpus is None: n_cpus = 1
if safe_serialization is None:
safe_serialization = True
save_pretrained_settings["safe_serialization"] = safe_serialization
elif safe_serialization and (n_cpus <= 2):
logger.warning_once(
f"Unsloth: You have {n_cpus} CPUs. Using `safe_serialization` is 10x slower.\n"\
f"We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.\n"\
f"To force `safe_serialization`, set it to `None` instead.",
)
safe_serialization = False
save_function = fast_save_pickle
save_pretrained_settings["safe_serialization"] = safe_serialization
save_pretrained_settings["save_function"] = save_function
pass
# Only safe_serialization uses more RAM
if safe_serialization:
max_ram -= sharded_ram_usage
else:
max_ram -= sharded_ram_usage*0.25 # Uses much less
pass
max_ram = int(max(0, max_ram) * maximum_memory_usage)
print(f"Unsloth: Will use up to "\
f"{round(max_ram/1024/1024/1024, 2)} out of "\
f"{round(psutil.virtual_memory().total/1024/1024/1024, 2)} RAM for saving.")
# Max directory for disk saving
if not os.path.exists(temporary_location):
os.makedirs(temporary_location)
pass
# Check if Kaggle or Colab, since only 20GB of Disk space allowed.
if IS_KAGGLE_ENVIRONMENT or IS_COLAB_ENVIRONMENT:
# We free up 4GB of space
logger.warning_once(
"Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded\n"\
"model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab."
)
_free_cached_model(internal_model)
pass
# HF also uses an OrderedDict
from collections import OrderedDict
state_dict = OrderedDict()
torch_dtype = internal_model.config.torch_dtype
if type(torch_dtype) is str:
if torch_dtype == "float16": torch_dtype = torch.float16
elif torch_dtype == "bfloat16": torch_dtype = torch.bfloat16
pass
# Check modules to save float32 dtype
state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data.to(torch_dtype)
max_vram = int(torch.cuda.get_device_properties(0).total_memory * maximum_memory_usage)
from tqdm import tqdm as ProgressBar
for j, layer in enumerate(ProgressBar(internal_model.model.layers)):
for item in LLAMA_WEIGHTS:
proj = eval(f"layer.{item}")
name = f"model.layers.{j}.{item}.weight"
W, bias = _merge_lora(proj, name)
# Bias term
if bias is not None:
state_dict[f"model.layers.{j}.{item}.bias"] = bias
pass
if (torch.cuda.memory_allocated() + W.nbytes) < max_vram:
# Save to GPU memory
state_dict[name] = W
# [TODO] Saving to RAM seems to leak memory???
# elif (max_ram - W.nbytes) > 0:
# # Save to CPU memory
# logger.warning_once(f"We will save to RAM and not VRAM now.")
# state_dict[name] = W.to("cpu", non_blocking = True, copy = True)
# max_ram = max(max_ram - W.nbytes, 0)
else:
# Save to Disk
logger.warning_once(f"We will save to Disk and not RAM now.")
filename = os.path.join(temporary_location, f"{name}.pt")
torch.save(W, filename, pickle_module = pickle, pickle_protocol = pickle.HIGHEST_PROTOCOL,)
state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True)
pass
for item in LLAMA_LAYERNORMS:
try:
# Skip for Gemma 2
state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data")
except:
continue
pass
pass
state_dict["model.norm.weight"] = internal_model.model.norm.weight.data
# Check for modules_to_save float32 dtype
# Check for tied weights
if internal_model.model.embed_tokens.weight.data_ptr() != internal_model.lm_head.weight.data_ptr():
state_dict["lm_head.weight"] = internal_model.lm_head.weight.data.to(torch_dtype)
pass
# All tensors MUST be type torch.Tensor and not torch.nn.parameter.Parameter
for key, value in state_dict.items():
if hasattr(value, "data"): state_dict[key] = value = value.data
if type(value) is not torch.Tensor:
logger.warning_once(f"Unsloth: {key} is not a Tensor but a {type(value)}.")
pass
pass
# Edit save_pretrained_settings
# [TODO] _create_repo has errors due to **kwargs getting accepted
save_pretrained_settings["state_dict"] = state_dict
# commit_description does not seem to work?
what_to_delete = ("use_temp_dir", "commit_message", "create_pr", "revision", "commit_description", "tags",) \
if not push_to_hub else \
("use_temp_dir", "create_pr", "revision", "tags", "commit_description",)
for deletion in what_to_delete:
del save_pretrained_settings[deletion]
pass
if hasattr(model, "add_model_tags"):
model.add_model_tags(["unsloth",])
# Update model tag
if push_to_hub:
_ = upload_to_huggingface(
model, save_pretrained_settings["save_directory"], token,
"finetuned", "trl", file_location = None,
old_username = username, private = private,
)
pass
# First check if we're pushing to an organization!
save_directory = save_pretrained_settings["save_directory"]
if save_pretrained_settings["push_to_hub"]:
new_save_directory, new_username = _determine_username(save_directory, username, token)
if token is not None:
from huggingface_hub import whoami
actual_username = whoami(token = token)["name"]
else:
actual_username = username
pass
# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
# We upload everything at the end!
tokenizer_save_settings["push_to_hub"] = False
tokenizer_save_settings["save_directory"] = new_save_directory
pass
# Save tokenizer
if tokenizer is not None:
print("Unsloth: Saving tokenizer...", end = "")
# Set padding side to left for inference
old_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
tokenizer.save_pretrained(**tokenizer_save_settings)
# Revert back padding side
tokenizer.padding_side = old_padding_side
print(" Done.")
else:
print()
pass
print("Unsloth: Saving model... This might take 5 minutes for Llama-7b...")
# Since merged, edit quantization_config
old_config = model.config
new_config = model.config.to_dict()
if "quantization_config" in new_config:
del new_config["quantization_config"]
original_model = model
new_config = type(model.config).from_dict(new_config)
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = new_config
model.config = new_config
# Save!
# [TODO] --> is this correct?
# save_pretrained_settings["selected_adapters"] = None
# Check if pushing to an organization
if save_pretrained_settings["push_to_hub"] and (username != actual_username):
print(f"Unsloth: Saving to organization with address {new_save_directory}")
# Pushing to organization!
# Sadly .save_pretrained doesn't work :(
# We first save it via .save_pretrained, then upload manually!
save_pretrained_settings["save_directory"] = new_save_directory
save_pretrained_settings["push_to_hub"] = False
internal_model.save_pretrained(**save_pretrained_settings)
# Now manually go through each file and upload them manually!
filenames = os.listdir(new_save_directory)
from huggingface_hub import HfApi
hf_api = HfApi(token = save_pretrained_settings["token"])
print("Unsloth: Uploading all files... Please wait...")
hf_api.upload_folder(
folder_path = new_save_directory,
path_in_repo = ".",
repo_id = new_save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
ignore_patterns = "*.md",
)
else:
internal_model.save_pretrained(**save_pretrained_settings)
pass
# Revert config back
original_model = model
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = old_config
model.config = old_config
print("Done.")
if push_to_hub and hasattr(model, "config"):
print(f"Saved merged model to https://huggingface.co/{username}/{save_directory.lstrip('/')}")
pass
save_pretrained_settings["state_dict"] = None
for j, (key, value) in enumerate(state_dict.items()):
state_dict[key] = None
if j % 10 == 0:
torch.cuda.empty_cache()
gc.collect()
pass
pass
state_dict = None
del state_dict
torch.cuda.empty_cache()
gc.collect()
# Remove temporary location
import shutil
shutil.rmtree(temporary_location, ignore_errors = True)
for _ in range(3):
torch.cuda.empty_cache()
gc.collect()
return save_directory, username
pass
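# Minimal usage sketch (illustrative; arguments are the ones defined in the
# signature above, the directory name is hypothetical):
#   unsloth_save_model(
#       model, tokenizer, "my_finetuned_model",
#       save_method = "merged_16bit",   # or "lora" / "merged_4bit_forced"
#       push_to_hub = False,
#   )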
def install_llama_cpp_clone_non_blocking():
full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"]
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_llama_cpp_make_non_blocking():
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# env = { **os.environ, "LLAMA_CUDA": "1", }
n_jobs = max(int(psutil.cpu_count()*1.5), 1)
# Force make clean
os.system("make clean -C llama.cpp")
full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"]
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_python_non_blocking(packages = []):
full_command = ["pip", "install"] + packages
run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT)
return run_installer
pass
def install_llama_cpp_old(version = -10):
# Download the 10th latest release since the latest might be broken!
# FALLBACK mechanism
releases = subprocess.check_output(["git", "ls-remote", "--tags", "https://github.com/ggerganov/llama.cpp.git"])
releases = releases.decode("utf-8").replace("\t", " ").split("\n")
for i, x in enumerate(releases):
if "refs/tags/b" not in x: break
releases = releases[:i]
latest = releases[-1]
version = releases[version].split(" ")[0]
# Check if the llama.cpp exists
if os.path.exists("llama.cpp"):
print(
"**[WARNING]** You have a llama.cpp old directory which is broken.\n"\
"Unsloth will DELETE the broken directory and install a new one.\n"\
"Press CTRL + C / cancel this if this is wrong. We shall wait 10 seconds.\n"
)
import time
for i in range(10):
print(f"**[WARNING]** Deleting llama.cpp directory... {10-i} seconds left.")
time.sleep(1)
import shutil
shutil.rmtree("llama.cpp", ignore_errors = True)
pass
# Clone a specific commit
# Also don't use the GPU!
commands = [
"git clone --recursive https://github.com/ggerganov/llama.cpp",
f"cd llama.cpp && git reset --hard {version} && git clean -df",
"make clean -C llama.cpp",
f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
]
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
pass
pass
# Check if successful
if not os.path.exists("llama.cpp/quantize") and not os.path.exists("llama.cpp/llama-quantize"):
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or `llama.cpp/quantize` does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
pass
def install_llama_cpp_blocking(use_cuda = False):
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# use_cuda = "LLAMA_CUDA=1" if use_cuda else ""
commands = [
"git clone --recursive https://github.com/ggerganov/llama.cpp",
"make clean -C llama.cpp",
# https://github.com/ggerganov/llama.cpp/issues/7062
# Weirdly GPU conversion for GGUF breaks??
# f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp",
f"make all -j{psutil.cpu_count()*2} -C llama.cpp",
"pip install gguf protobuf",
]
if os.path.exists("llama.cpp"): return
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
pass
pass
pass
def get_executable(executables):
# Get the system PATH directories, split on the platform path separator
system_directories = os.environ.get("PATH").split(os.pathsep)
for directory in system_directories:
for executable in executables:
path = os.path.join(directory, executable)
# Check if the executable exists and is executable
if os.path.exists(path) and os.access(path, os.X_OK): return path
pass
pass
return None
pass
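# Example: get_executable(["llama-quantize", "quantize"]) returns the first matching
# executable found on the system PATH, or None if neither exists.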
def save_to_gguf(
model_type : str,
model_dtype : str,
is_sentencepiece : bool = False,
model_directory : str = "unsloth_finetuned_model",
quantization_method = "fast_quantized", # Can be a list of options! ["q4_k_m", "q8_0", "q5_k_m"]
first_conversion : str = None,
_run_installer = None, # Non blocking install of llama.cpp
):
# logger.warning(
# "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\
# "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\
# "Please be patient - GGUF saving should still work, but might not work as well."
# )
assert(model_dtype == "float16" or model_dtype == "bfloat16")
model_dtype = "f16" if model_dtype == "float16" else "bf16"
# Convert quantization_method to list
if isinstance(quantization_method, list): pass
elif isinstance(quantization_method, str): quantization_method = [ quantization_method, ]
elif isinstance(quantization_method, tuple): quantization_method = list(quantization_method)
else:
raise TypeError("Unsloth: quantization_method can only be a string or a list of strings")
pass
# Check if bfloat16 is supported
if model_dtype == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
"Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
"We shall switch instead to f16."
)
model_dtype = "f16"
pass
# Check first_conversion as well
if first_conversion is None:
first_conversion = model_dtype
pass
# Check I quants
for quant_method in quantization_method:
if quant_method.startswith("iq2"):
raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!")
pass
# Careful convert.py is only for Llama / Mistral based archs
use_fast_convert = False
if not is_sentencepiece: use_fast_convert = False # Llama-3
elif model_type == "llama": use_fast_convert = True
elif model_type == "mistral": use_fast_convert = True
pass
logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.")
# Map quant methods
new_quantization_method = []
for quant_method in quantization_method:
if quant_method == "not_quantized": quant_method = model_dtype
elif quant_method == "fast_quantized": quant_method = "q8_0"
elif quant_method == "quantized": quant_method = "q4_k_m"
elif quant_method is None: quant_method = "q8_0"
# Check if wrong method
if quant_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quant_method}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
pass
new_quantization_method.append(quant_method)
pass
quantization_method = new_quantization_method
print_info = \
f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\
f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits will take 3 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 10 minutes each.\n"\
f' "-____-" In total, you will have to wait at least 16 minutes.\n'
print(print_info)
# Check first_conversion format
if first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "f32" : pass
elif first_conversion == "q8_0" : pass
else:
raise RuntimeError(
f"Unsloth: `first_conversion` can only be one of ['f16', 'bf16', 'f32', 'q8_0'] and not `{first_conversion}`."
)
pass
# Determine whether the system already has llama.cpp installed and the scripts are executable
quantize_location = get_executable(["llama-quantize", "quantize"])
convert_location = get_executable(["convert-hf-to-gguf.py", "convert_hf_to_gguf.py"])
if quantize_location is not None and convert_location is not None:
print("Unsloth: llama.cpp found in the system. We shall skip installation.")
else:
print("Unsloth: [0] Installing llama.cpp. This will take 3 minutes...")
if _run_installer is not None:
error = _run_installer.wait()
else:
error = 0
install_llama_cpp_blocking()
pass
# Check if successful. If not install 10th latest release
# Careful llama.cpp/quantize changed to llama.cpp/llama-quantize
# and llama.cpp/main changed to llama.cpp/llama-cli
# See https://github.com/ggerganov/llama.cpp/pull/7809
quantize_location = None
if os.path.exists("llama.cpp/quantize"):
quantize_location = "llama.cpp/quantize"
elif os.path.exists("llama.cpp/llama-quantize"):
quantize_location = "llama.cpp/llama-quantize"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
# See https://github.com/unslothai/unsloth/pull/730
# Filenames changed again!
convert_location = None
if os.path.exists("llama.cpp/convert-hf-to-gguf.py"):
convert_location = "llama.cpp/convert-hf-to-gguf.py"
elif os.path.exists("llama.cpp/convert_hf_to_gguf.py"):
convert_location = "llama.cpp/convert_hf_to_gguf.py"
else:
raise RuntimeError(
"Unsloth: The file 'llama.cpp/convert-hf-to-gguf.py' or 'llama.cpp/convert_hf_to_gguf.py' does not exist.\n"\
"But we expect this file to exist! Maybe the llama.cpp developers changed the name?"
)
pass
if error != 0 or quantize_location is None or convert_location is None:
print(f"Unsloth: llama.cpp error code = {error}.")
install_llama_cpp_old(-10)
pass
pass
# Determine maximum first_conversion state
if first_conversion == "f32" : strength = 3
elif first_conversion == "f16" : strength = 2
elif first_conversion == "bf16" : strength = 1
elif first_conversion == "q8_0" : strength = 0
for quant_method in quantization_method:
if quant_method == "f32": strength = max(strength, 3)
elif quant_method == "f16": strength = max(strength, 2)
elif quant_method == "bf16": strength = max(strength, 1)
elif quant_method == "q8_0": strength = max(strength, 0)
else:
# Quantized models must have f16 as the default argument
if first_conversion == "f32" : pass
elif first_conversion == "f16" : pass
elif first_conversion == "bf16" : pass
elif first_conversion == "q8_0":
logger.warning_once(
"Unsloth: Using q8_0 for the `first_conversion` will lose a bit of accuracy, "\
"but saves disk space!"
)
# first_conversion = "f16"
pass
pass
pass
# If only q8_0:
if len(quantization_method) == 1 and quantization_method[0] == "q8_0":
strength = 0
pass
if strength >= 3: first_conversion = "f32"
elif strength >= 2: first_conversion = "f16"
elif strength >= 1: first_conversion = "bf16"
else: first_conversion = "q8_0"
# Non Llama/Mistral models can only use f32 or f16
if not use_fast_convert and first_conversion not in ("f16", "bf16", "f32"):
pass
# Latest llama.cpp works for all models for q8_0!
# logger.warning_once("Unsloth: We must use f16 for non Llama and Mistral models.")
# first_conversion = "f16"
pass
# Check if bfloat16 is supported
if first_conversion == "bf16" and not torch.cuda.is_bf16_supported():
logger.warning(
"Unsloth: Cannot convert to bf16 GGUF since your computer doesn't support it.\n"\
"We shall switch instead to f16."
)
first_conversion = "f16"
pass
n_cpus = psutil.cpu_count()
if n_cpus is None: n_cpus = 1
n_cpus *= 2
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
final_location = f"./{model_directory}/unsloth.{first_conversion.upper()}.gguf"
print(f"Unsloth: [1] Converting model at {model_directory} into {first_conversion} GGUF format.\n"\
f"The output location will be {final_location}\n"\
"This will take 3 minutes...")
# We first check if tokenizer.model exists in the model_directory
if os.path.exists(f"{model_directory}/tokenizer.model"):
vocab_type = "spm,hfft,bpe"
# Fix Sentencepiece model as well!
fix_sentencepiece_gguf(model_directory)
else:
vocab_type = "bpe"
pass
# convert.py is deprecated!
use_fast_convert = False
if use_fast_convert:
command = f"python llama.cpp/convert.py {model_directory} "\
f"--outfile {final_location} --vocab-type {vocab_type} "\
f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab"
else:
command = f"python {convert_location} {model_directory} "\
f"--outfile {final_location} "\
f"--outtype {first_conversion}"
pass
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")
full_precision_location = final_location
all_saved_locations = [full_precision_location,]
# Convert each type!
for quant_method in quantization_method:
if quant_method != first_conversion:
print(f"Unsloth: [2] Converting GGUF 16bit into {quant_method}. This will take 20 minutes...")
final_location = f"./{model_directory}/unsloth.{quant_method.upper()}.gguf"
command = f"./{quantize_location} {full_precision_location} "\
f"{final_location} {quant_method} {n_cpus}"
# quantize uses stderr
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
line = line.decode("utf-8", errors = "replace")
if "undefined reference" in line:
raise RuntimeError("Failed compiling llama.cpp. Please report this ASAP!")
print(line, flush = True, end = "")
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
# Check if quantization succeeded!
if not os.path.isfile(final_location):
if IS_KAGGLE_ENVIRONMENT:
raise RuntimeError(
f"Unsloth: Quantization failed for {final_location}\n"\
"You are in a Kaggle environment, which might be the reason this is failing.\n"\
"Kaggle only provides 20GB of disk space. Merging to 16bit for 7b models use 16GB of space.\n"\
"This means using `model.{save_pretrained/push_to_hub}_merged` works, but\n"\
"`model.{save_pretrained/push_to_hub}_gguf will use too much disk space.\n"\
"I suggest you to save the 16bit model first, then use manual llama.cpp conversion."
)
else:
raise RuntimeError(
"Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\
"You do not need to close this Python program. Run the following commands in a new terminal:\n"\
"You must run this in the same folder as you're saving your model.\n"\
"git clone --recursive https://github.com/ggerganov/llama.cpp\n"\
"cd llama.cpp && make clean && make all -j\n"\
"Once that's done, redo the quantization."
)
pass
pass
print(f"Unsloth: Conversion completed! Output location: {final_location}")
all_saved_locations.append(final_location)
pass
pass
return all_saved_locations
pass
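# A minimal sketch of how `save_to_gguf` above is typically invoked, mirroring the call
# sites further below in this file. The directory name, dtype, and quant list are
# illustrative assumptions, not values from a real run:
#
#   all_locations = save_to_gguf(
#       "llama",              # model_type, taken from model.config.model_type
#       "float16",            # model_dtype, taken from model.config.torch_dtype
#       False,                # is_sentencepiece_model
#       "my_merged_model",    # directory holding the merged 16bit HF weights
#       ["q4_k_m", "q8_0"],   # quantization_method list (aliases are mapped above)
#       "f16",                # first_conversion format
#       None,                 # non-blocking llama.cpp installer handle (may be None)
#   )
#   # Returns a list of GGUF file paths: the first conversion plus one per quant method.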
def unsloth_save_pretrained_merged(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
"""
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose `save_method` to be one of:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.save_pretrained(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
del arguments["self"]
unsloth_save_model(**arguments)
for _ in range(3):
gc.collect()
pass
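# Hedged usage sketch for the merged-save path. `model` and `tokenizer` are assumed to
# come from an Unsloth / PEFT setup loaded elsewhere, and the output directory name is
# illustrative. Once `patch_saving_functions` (defined below) attaches this method:
#
#   model.save_pretrained_merged(
#       "merged_model_16bit",        # hypothetical local output directory
#       tokenizer,
#       save_method = "merged_16bit",
#   )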
def unsloth_push_to_hub_merged(
self,
repo_id : str,
tokenizer = None,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
token : Union[bool, str, None] = None,
max_shard_size : Union[int, str, None] = "5GB",
create_pr : bool = False,
safe_serialization : bool = True,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : Optional[List[str]] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.75,
):
"""
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose `save_method` to be one of:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.push_to_hub(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = True
del arguments["self"]
del arguments["repo_id"]
unsloth_save_model(**arguments)
for _ in range(3):
gc.collect()
pass
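# Hedged usage sketch for pushing merged weights to the Hub. The repo id and token
# placeholder are assumptions for illustration only:
#
#   model.push_to_hub_merged(
#       "your-username/your-merged-model",   # hypothetical repo_id
#       tokenizer,
#       save_method = "merged_16bit",
#       token = "hf_...",                    # your Hugging Face write token
#   )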
MODEL_CARD = \
"""---
base_model: {base_model}
tags:
- text-generation-inference
- transformers
- unsloth
- {model_type}
- {extra}
license: apache-2.0
language:
- en
---
# Uploaded {method} model
- **Developed by:** {username}
- **License:** apache-2.0
- **Finetuned from model :** {base_model}
This {model_type} model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
"""
def _determine_username(save_directory, old_username, token):
username = ""
save_directory = save_directory.lstrip("./")
if "/" not in save_directory:
from huggingface_hub import whoami
try:
username = whoami(token = token)["name"]
if type(old_username) is str and username != old_username:
username = old_username
pass
save_directory = f"{username}/{save_directory}"
except:
raise RuntimeError(f"Unsloth: {save_directory} is not a Huggingface directory.")
else:
username = save_directory.split("/")[0]
pass
return save_directory, username
pass
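# Behaviour sketch for `_determine_username` (values are illustrative only):
#
#   _determine_username("my_model", None, token)
#       # -> ("<whoami-name>/my_model", "<whoami-name>"), username looked up via whoami()
#   _determine_username("some-org/my_model", None, token)
#       # -> ("some-org/my_model", "some-org"), username taken from the repo prefix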
def upload_to_huggingface(
model,
save_directory,
token,
method,
extra = "",
file_location = None,
old_username = None,
private = None,
):
save_directory, username = _determine_username(save_directory, old_username, token)
from huggingface_hub import create_repo
try:
create_repo(
repo_id = save_directory,
token = token,
repo_type = "model",
exist_ok = False,
private = private,
)
# Create model card
from huggingface_hub import ModelCard
content = MODEL_CARD.format(
username = username,
base_model = model.config._name_or_path,
model_type = model.config.model_type,
method = "",
extra = extra,
)
card = ModelCard(content)
card.push_to_hub(save_directory, token = token)
except:
pass
if file_location is not None:
# Now upload file
from huggingface_hub import HfApi
hf_api = HfApi(token = token)
if "/" in file_location:
uploaded_location = file_location[file_location.rfind("/")+1:]
else:
uploaded_location = file_location
pass
# find ftevent file from tensorboard and upload it
import glob
ftevent_files = glob.glob("*out.tfevents*", recursive = True)
if len(ftevent_files) > 0:
print("Unsloth: Uploading tensorboard files... Please wait...", file_location + "*out.tfevents*")
for ftevent_file in ftevent_files:
hf_api.upload_file(
path_or_fileobj = ftevent_file,
path_in_repo = ftevent_file.replace(file_location, ""),
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
hf_api.upload_file(
path_or_fileobj = file_location,
path_in_repo = uploaded_location,
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
# We also upload a config.json file
import json
with open("_temporary_unsloth_config.json", "w") as file:
json.dump({"model_type" : model.config.model_type}, file, indent = 4)
pass
hf_api.upload_file(
path_or_fileobj = "_temporary_unsloth_config.json",
path_in_repo = "config.json",
repo_id = save_directory,
repo_type = "model",
commit_message = "(Trained with Unsloth)",
)
os.remove("_temporary_unsloth_config.json")
pass
return username
pass
def fix_tokenizer_bos_token(tokenizer):
# Check if BOS added already, then warn
fix_bos_token = False
chat_template = getattr(tokenizer, "chat_template", None)
if (tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None)):
if chat_template is not None and \
(
tokenizer.bos_token in chat_template or \
"{bos_token}" in chat_template.replace(" ", "") or \
"{bos_token+" in chat_template.replace(" ", "")
):
fix_bos_token = True
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily."
)
# Remove {{bos_token}}
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\}[\s]{0,}\}", "", chat_template)
# Remove {{bos_token +
new_chat_template = re.sub(r"\{[\s]{0,}\{[\s]{0,}bos\_token[\s]{0,}\+[\s]{0,}", "", new_chat_template)
tokenizer.chat_template = new_chat_template
pass
pass
return fix_bos_token, chat_template
pass
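# Illustrative sketch of what `fix_tokenizer_bos_token` does to a chat template that
# both auto-adds BOS and hardcodes it. The template text below is a made-up example:
#
#   tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}...{% endfor %}"
#   fix_bos_token, old_template = fix_tokenizer_bos_token(tokenizer)
#   # fix_bos_token is True and the leading "{{ bos_token }}" is stripped from
#   # tokenizer.chat_template, so GGUF conversion does not double the BOS token.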
def create_ollama_modelfile(tokenizer, gguf_location):
"""
Creates an Ollama Modelfile.
Use ollama.create(model = "new_ollama_model", modelfile = modelfile)
"""
modelfile = getattr(tokenizer, "_ollama_modelfile", None)
if modelfile is None: return None
modelfile = modelfile\
.replace("{{", "⚫@✅#🦥")\
.replace("}}", "⚡@🦥#⛵")\
.format(
__FILE_LOCATION__ = gguf_location,
)\
.replace("⚫@✅#🦥", "{{")\
.replace("⚡@🦥#⛵", "}}")\
.rstrip()
pass
return modelfile
pass
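# Illustrative sketch of `create_ollama_modelfile`: if the tokenizer carries an
# `_ollama_modelfile` template (set elsewhere by Unsloth's chat template code), the
# `__FILE_LOCATION__` placeholder is filled in while Ollama's own {{ ... }} syntax is
# preserved. The template and path below are assumptions:
#
#   tokenizer._ollama_modelfile = 'FROM {__FILE_LOCATION__}\nTEMPLATE """{{ .Prompt }}"""'
#   create_ollama_modelfile(tokenizer, "./model/unsloth.Q8_0.gguf")
#   # -> 'FROM ./model/unsloth.Q8_0.gguf\nTEMPLATE """{{ .Prompt }}"""'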
def unsloth_save_pretrained_gguf(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
quantization_method : str = "fast_quantized",
first_conversion : str = None,
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
private : Optional[bool] = None,
is_main_process : bool = True,
state_dict : Optional[dict] = None,
save_function : Callable = torch.save,
max_shard_size : Union[int, str] = "5GB",
safe_serialization : bool = True,
variant : Optional[str] = None,
save_peft_format : bool = True,
tags : List[str] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.85,
):
"""
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16, then converted to GGUF / llama.cpp format.
Choose `quantization_method` to be one of:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q4_k" : "alias for q4_k_m",
"q5_k" : "alias for q5_k_m",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
"iq2_xxs" : "2.06 bpw quantization",
"iq2_xs" : "2.31 bpw quantization",
"iq3_xxs" : "3.06 bpw quantization",
"q3_k_xs" : "3-bit extra small quantization",
"""
if tokenizer is None:
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["quantization_method"]
del arguments["first_conversion"]
# Fix tokenizer adding an extra BOS token at the front
fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
# Non blocking install GGUF first
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
else:
try:
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
except:
# Retry by recloning llama.cpp
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
pass
pass
# Restore the old chat template if the BOS token was removed
if fix_bos_token:
tokenizer.chat_template = old_chat_template
pass
for _ in range(3):
gc.collect()
model_dtype = self.config.torch_dtype
model_type = self.config.model_type
if type(model_dtype) is str:
assert(model_dtype == "float16" or model_dtype == "bfloat16")
elif model_dtype == torch.float16:
model_dtype = "float16"
elif model_dtype == torch.bfloat16:
model_dtype = "bfloat16"
else:
raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
pass
is_sentencepiece_model = check_if_sentencepiece_model(self)
# Save to GGUF
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)
# Save Ollama modelfile
modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
modelfile_location = None
if modelfile is not None:
modelfile_location = os.path.join(new_save_directory, "Modelfile")
with open(modelfile_location, "w") as file:
file.write(modelfile)
pass
print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
pass
if fix_bos_token:
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### We removed it in GGUF's chat template for you."
)
pass
if push_to_hub:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
for file_location in all_file_locations:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
# Save modelfile
if modelfile_location is not None:
username = upload_to_huggingface(
self, save_directory, token,
"GGUF converted", "gguf", modelfile_location, old_username, private,
)
print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
pass
pass
pass
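# Hedged usage sketch for local GGUF export. The directory name and quant choice are
# illustrative; `model` / `tokenizer` come from an Unsloth setup loaded elsewhere:
#
#   model.save_pretrained_gguf(
#       "gguf_model",                     # hypothetical local output directory
#       tokenizer,
#       quantization_method = "q4_k_m",
#   )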
def unsloth_push_to_hub_gguf(
self,
repo_id : str,
tokenizer = None,
quantization_method : str = "fast_quantized",
first_conversion : str = None,
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = "Trained with Unsloth",
private : Optional[bool] = None,
token : Union[bool, str, None] = None,
max_shard_size : Union[int, str, None] = "5GB",
create_pr : bool = False,
safe_serialization : bool = True,
revision : str = None,
commit_description : str = "Upload model trained with Unsloth 2x faster",
tags : Optional[List[str]] = None,
temporary_location : str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage : float = 0.85,
):
"""
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16, then converted to GGUF / llama.cpp format.
Choose `quantization_method` to be one of:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
"f32" : "Not recommended. Retains 100% accuracy, but super slow and memory hungry.",
"f16" : "Fastest conversion + retains 100% accuracy. Slow and memory hungry.",
"q8_0" : "Fast conversion. High resource use, but generally acceptable.",
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
"q4_0" : "Original quant method, 4-bit.",
"q4_1" : "Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.",
"q4_k_s" : "Uses Q4_K for all tensors",
"q5_0" : "Higher accuracy, higher resource usage and slower inference.",
"q5_1" : "Even higher accuracy, resource usage and slower inference.",
"q5_k_s" : "Uses Q5_K for all tensors",
"q6_k" : "Uses Q8_K for all tensors",
"""
if tokenizer is None:
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["repo_id"]
del arguments["quantization_method"]
del arguments["first_conversion"]
# Fix tokenizer adding an extra BOS token at the front
fix_bos_token, old_chat_template = fix_tokenizer_bos_token(tokenizer)
# Non blocking install GGUF first
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
else:
try:
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
except:
# Retry by recloning llama.cpp
if IS_KAGGLE_ENVIRONMENT:
# Kaggle is weird - no blocking installs, and no CUDA?
python_install = install_python_non_blocking(["gguf", "protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda = False)
new_save_directory, old_username = unsloth_save_model(**arguments)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["gguf", "protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
new_save_directory, old_username = unsloth_save_model(**arguments)
python_install.wait()
pass
pass
pass
# Restore the old chat template if the BOS token was removed
if fix_bos_token:
tokenizer.chat_template = old_chat_template
pass
for _ in range(3):
gc.collect()
model_dtype = self.config.torch_dtype
model_type = self.config.model_type
if type(model_dtype) is str:
assert(model_dtype == "float16" or model_dtype == "bfloat16")
elif model_dtype == torch.float16:
model_dtype = "float16"
elif model_dtype == torch.bfloat16:
model_dtype = "bfloat16"
else:
raise TypeError("Unsloth: Model dtype can only be float16 or bfloat16")
pass
is_sentencepiece_model = check_if_sentencepiece_model(self)
# Save to GGUF
all_file_locations = save_to_gguf(model_type, model_dtype, is_sentencepiece_model,
new_save_directory, quantization_method, first_conversion, makefile,
)
# Save Ollama modelfile
modelfile = create_ollama_modelfile(tokenizer, all_file_locations[0])
modelfile_location = None
if modelfile is not None:
modelfile_location = os.path.join(new_save_directory, "Modelfile")
with open(modelfile_location, "w") as file:
file.write(modelfile)
pass
print(f"Unsloth: Saved Ollama Modelfile to {modelfile_location}")
pass
for file_location in all_file_locations:
print("Unsloth: Uploading GGUF to Huggingface Hub...")
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", file_location, old_username, private,
)
link = f"{username}/{new_save_directory.lstrip('/.')}" \
if username not in new_save_directory else \
new_save_directory.lstrip('/.')
print(f"Saved GGUF to https://huggingface.co/{link}")
pass
# Save modelfile
if modelfile_location is not None:
username = upload_to_huggingface(
self, repo_id, token,
"GGUF converted", "gguf", modelfile_location, old_username, private,
)
print(f"Saved Ollama Modelfile to https://huggingface.co/{link}")
pass
if fix_bos_token:
logger.warning(
f"Unsloth: ##### The current model auto adds a BOS token.\n"\
"Unsloth: ##### We removed it in GGUF's chat template for you."
)
pass
pass
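# Hedged usage sketch for pushing GGUF files to the Hub. The repo id and token are
# placeholders, not real values:
#
#   model.push_to_hub_gguf(
#       "your-username/your-gguf-model",  # hypothetical repo_id
#       tokenizer,
#       quantization_method = "q8_0",
#       token = "hf_...",
#   )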
# Helper to save LoRA adapters to a custom directory
def save_lora_to_custom_dir(model, tokenizer, save_directory):
# Create the custom directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)
# Call the unsloth_save_model function with the custom directory
unsloth_save_model(
model,
tokenizer,
save_directory=save_directory,
save_method="lora",
push_to_hub=False,
)
# Method to convert LoRA adapters to GGML format and push them to the Hugging Face Hub
def unsloth_convert_lora_to_ggml_and_push_to_hub(
self,
tokenizer,
repo_id: str,
use_temp_dir: Optional[bool] = None,
commit_message: Optional[str] = "Converted LoRA to GGML with Unsloth",
private: Optional[bool] = None,
token: Union[bool, str, None] = None,
create_pr: bool = False,
revision: str = None,
commit_description: str = "Convert LoRA to GGML format using Unsloth",
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
python_install = install_python_non_blocking(["protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda=False)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
python_install.wait()
else:
makefile = None
for _ in range(3):
gc.collect()
lora_directory_push = "lora-to-ggml-push"
save_lora_to_custom_dir(self, tokenizer, lora_directory_push)
model_type = self.config.model_type
output_file = os.path.join(lora_directory_push, "ggml-adapter-model.bin")
print(f"Unsloth: Converting auto-saved LoRA adapters at {lora_directory_push} to GGML format.")
print(f"The output file will be {output_file}")
command = f"python3 llama.cpp/convert-lora-to-ggml.py {lora_directory_push} {output_file} llama"
try:
with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
for line in sp.stdout:
print(line, end="", flush=True)
for line in sp.stderr:
print(line, end="", flush=True)
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, command)
except subprocess.CalledProcessError as e:
print(f"Error: Conversion failed with return code {e.returncode}")
return
print(f"Unsloth: Conversion completed! Output file: {output_file}")
print("Unsloth: Uploading GGML file to Hugging Face Hub...")
username = upload_to_huggingface(
self, repo_id, token,
"GGML converted LoRA", "ggml", output_file, None, private,
)
link = f"{repo_id.lstrip('/')}"
print("Unsloth: Done.")
print(f"Converted LoRA to GGML and uploaded to https://huggingface.co/{link}")
print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
def unsloth_convert_lora_to_ggml_and_save_locally(
self,
save_directory: str, # Added parameter for the folder name
tokenizer,
temporary_location: str = "_unsloth_temporary_saved_buffers",
maximum_memory_usage: float = 0.85,
):
if not os.path.exists("llama.cpp"):
if IS_KAGGLE_ENVIRONMENT:
python_install = install_python_non_blocking(["protobuf"])
python_install.wait()
install_llama_cpp_blocking(use_cuda=False)
makefile = None
else:
git_clone = install_llama_cpp_clone_non_blocking()
python_install = install_python_non_blocking(["protobuf"])
git_clone.wait()
makefile = install_llama_cpp_make_non_blocking()
python_install.wait()
else:
makefile = None
for _ in range(3):
gc.collect()
# Use the provided save_directory for local saving
save_lora_to_custom_dir(self, tokenizer, save_directory)
model_type = self.config.model_type
output_file = os.path.join(save_directory, "ggml-adapter-model.bin")
print(f"Unsloth: Converting auto-saved LoRA adapters at {save_directory} to GGML format.")
print(f"The output file will be {output_file}")
command = f"python3 llama.cpp/convert-lora-to-ggml.py {save_directory} {output_file} llama"
try:
with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True) as sp:
for line in sp.stdout:
print(line, end="", flush=True)
for line in sp.stderr:
print(line, end="", flush=True)
sp.wait()
if sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, command)
except subprocess.CalledProcessError as e:
print(f"Error: Conversion failed with return code {e.returncode}")
return
print("Unsloth: Done.")
print(f"Unsloth: Conversion completed! Output file: {output_file}")
print("\nThis GGML making function was made by Maheswar. Ping him @Maheswar on the Unsloth Discord or on HuggingFace (@mahiatlinux) if you like this!")
def patch_saving_functions(model):
import inspect
import types
from typing import Callable, Optional, Union, List
# And now re add our saving methods!
if model.push_to_hub.__name__ == "unsloth_push_to_hub":
original_push_to_hub = model.original_push_to_hub
else:
original_push_to_hub = model.push_to_hub
pass
signature = str(inspect.signature(original_push_to_hub)).replace("NoneType", "None")
signature = signature[1:]
signature = re.sub("<function save at .+?>", "torch.save", signature)
docs = original_push_to_hub.__doc__.encode("utf-8").decode("utf-8")
push_to_hub_text = f'''def unsloth_push_to_hub(self, {signature}:
"""
{docs}
"""
arguments = dict(locals())
del arguments["self"]
if "tags" in arguments and arguments["tags"] is not None:
assert(isinstance(arguments["tags"], (list, tuple)))
arguments["tags"] = list(arguments["tags"]) + ["unsloth",]
elif "tags" in arguments:
arguments["tags"] = ["unsloth",]
elif hasattr(self, "add_model_tags"):
self.add_model_tags(["unsloth",])
if "commit_message" in arguments:
commit_message = arguments["commit_message"]
if commit_message is not None:
if not commit_message.endswith(" "): commit_message += " "
if "Unsloth" not in commit_message:
commit_message += "(Trained with Unsloth)"
else:
commit_message = "Upload model trained with Unsloth"
arguments["commit_message"] = commit_message
if "commit_description" in arguments:
commit_description = arguments["commit_description"]
if commit_description is not None:
if not commit_description.endswith(" "): commit_description += " "
if "Unsloth" not in commit_description:
commit_description += "(Trained with Unsloth 2x faster)"
else:
commit_description = "Upload model trained with Unsloth 2x faster"
arguments["commit_description"] = commit_description
# Update model tag
if hasattr(self, "config"):
_ = upload_to_huggingface(
self, arguments["repo_id"], arguments["token"],
"finetuned", "trl", file_location = None,
old_username = None, private = arguments["private"],
)
pass
try:
self.original_push_to_hub(**arguments)
except:
del arguments["tags"]
self.original_push_to_hub(**arguments)
pass
if hasattr(self, "config"):
print("Saved model to https://huggingface.co/" + arguments["repo_id"])
pass
'''
exec(push_to_hub_text, globals())
original_model = model
while True:
if original_model.push_to_hub.__name__ != "unsloth_push_to_hub":
original_model.original_push_to_hub = original_model.push_to_hub
original_model.push_to_hub = types.MethodType(unsloth_push_to_hub, original_model)
if hasattr(original_model, "add_model_tags"):
original_model.add_model_tags(["unsloth",])
pass
pass
if hasattr(original_model, "model"): original_model = original_model.model
else: break
pass
# Add saving methods to top level model
if hasattr(model, "config"):
# Counteract tokenizers
model.push_to_hub_merged = types.MethodType(unsloth_push_to_hub_merged, model)
model.save_pretrained_merged = types.MethodType(unsloth_save_pretrained_merged, model)
model.push_to_hub_gguf = types.MethodType(unsloth_push_to_hub_gguf, model)
model.save_pretrained_gguf = types.MethodType(unsloth_save_pretrained_gguf, model)
model.push_to_hub_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_push_to_hub, model)
model.save_pretrained_ggml = types.MethodType(unsloth_convert_lora_to_ggml_and_save_locally, model)
pass
return model
pass
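# Hedged end-to-end sketch of how the patched saving methods are meant to be used.
# The model/tokenizer loading line is an assumption about code that lives outside this
# file; everything after `patch_saving_functions` only uses methods defined above:
#
#   # model, tokenizer = FastLanguageModel.from_pretrained("unsloth/llama-3-8b-bnb-4bit")
#   model = patch_saving_functions(model)
#   model.save_pretrained_merged("merged_16bit_model", tokenizer, save_method = "merged_16bit")
#   model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method = "q4_k_m")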