Commit 0b5cd1a0 authored by liangjing

update

parent 5352a639
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[project]
name = "unsloth"
dynamic = ["version"]
description = "2-5X faster LLM finetuning"
readme = "README.md"
requires-python = ">=3.9"
license = {file = "LICENSE"}
keywords = ["ai", "llm",]
authors = [
{email = "info@unsloth.ai"},
{name = "Unsloth AI team"},
]
maintainers = [
{name = "Daniel Han", email = "danielhanchen@gmail.com"},
{name = "Michael Han", email = "info@unsloth.ai"},
]
classifiers = [
"Programming Language :: Python",
]
[tool.setuptools.dynamic]
version = {attr = "unsloth.models._utils.__version__"}
[tool.setuptools]
include-package-data = false
[tool.setuptools.packages.find]
exclude = ["images*"]
[project.optional-dependencies]
huggingface = [
"unsloth_zoo",
"packaging",
"tyro",
"transformers>=4.44.2",
"datasets>=2.16.0",
"sentencepiece>=0.2.0",
"tqdm",
"psutil",
"wheel>=0.42.0",
"numpy",
"accelerate>=0.34.1",
"trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3",
"peft>=0.7.1,!=0.11.0",
"protobuf<4.0.0",
"huggingface_hub",
"hf_transfer",
]
cu118only = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121only = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch212 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121onlytorch212 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'",
]
cu121onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'",
]
cu118onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12'",
]
cu121onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu124onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu121onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu124onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11'",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12'",
]
cu118 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118only]",
]
cu121 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121only]",
]
cu118-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch211]",
]
cu121-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
]
cu118-torch212 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch212]",
]
cu121-torch212 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch212]",
]
cu118-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch220]",
]
cu121-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
]
cu118-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch230]",
]
cu121-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch230]",
]
cu118-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch240]",
]
cu121-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch240]",
]
cu121-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch250]",
]
cu124-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch240]",
]
cu124-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch250]",
]
kaggle = [
"unsloth[huggingface]",
]
kaggle-new = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
]
conda = [
"unsloth[huggingface]",
]
colab-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
]
colab-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
colab-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
]
colab-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
colab-new = [
"unsloth_zoo",
"packaging",
"tyro",
"transformers>=4.44.2",
"datasets>=2.16.0",
"sentencepiece>=0.2.0",
"tqdm",
"psutil",
"wheel>=0.42.0",
"numpy",
"protobuf<4.0.0",
"huggingface_hub",
"hf_transfer",
"bitsandbytes>=0.43.3",
]
colab-no-deps = [
"accelerate>=0.34.1",
"trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3",
"peft>=0.7.1",
"xformers<0.0.27",
"bitsandbytes>=0.43.3",
"protobuf<4.0.0",
]
colab = [
"unsloth[cu121]",
]
colab-ampere = [
"unsloth[colab-ampere-torch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118only]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121only]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch230]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch230]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu118-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu118onlytorch240]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch240]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu121-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu121onlytorch250]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu124-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch240]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
cu124-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.43.3",
"unsloth[cu124onlytorch250]",
"packaging",
"ninja",
"flash-attn>=2.6.3",
]
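# Note: each cuXXX[-torchYYY] extra above composes the shared "huggingface" stack,
# bitsandbytes, and a pinned xformers wheel for that CUDA / torch / Python combination;
# the "-ampere" variants additionally pull in flash-attn. Example install (same command
# format the version-check helper further below prints; pick the extra matching your setup):
#   pip install --upgrade pip && pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"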
[project.urls]
homepage = "http://www.unsloth.ai"
documentation = "https://github.com/unslothai/unsloth"
repository = "https://github.com/unslothai/unsloth"
#!/usr/bin/env python3
"""
🦥 Starter Script for Fine-Tuning FastLanguageModel with Unsloth
This script is designed as a starting point for fine-tuning your models using unsloth.
It includes configurable options for model loading, PEFT parameters, training arguments,
and model saving/pushing functionalities.
You will likely want to customize this script to suit your specific use case
and requirements.
Here are a few suggestions for customization:
- Modify the dataset loading and preprocessing steps to match your data.
- Customize the model saving and pushing configurations.
Usage (most options have valid defaults; this extended example is for demonstration purposes):
python unsloth-cli.py --model_name "unsloth/llama-3-8b" --max_seq_length 8192 --dtype None --load_in_4bit \
--r 64 --lora_alpha 32 --lora_dropout 0.1 --bias "none" --use_gradient_checkpointing "unsloth" \
--random_state 3407 --use_rslora --per_device_train_batch_size 4 --gradient_accumulation_steps 8 \
--warmup_steps 5 --max_steps 400 --learning_rate 2e-6 --logging_steps 1 --optim "adamw_8bit" \
--weight_decay 0.005 --lr_scheduler_type "linear" --seed 3407 --output_dir "outputs" \
--report_to "tensorboard" --save_model --save_path "model" --quantization_method "f16" \
--push_model --hub_path "hf/model" --hub_token "your_hf_token"
To see a full list of configurable options, use:
python unsloth-cli.py --help
Happy fine-tuning!
"""
import argparse
def run(args):
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import logging
logging.getLogger('hf-to-gguf').setLevel(logging.WARNING)
# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=args.model_name,
max_seq_length=args.max_seq_length,
dtype=args.dtype,
load_in_4bit=args.load_in_4bit,
)
# Configure PEFT model
model = FastLanguageModel.get_peft_model(
model,
r=args.r,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
bias=args.bias,
use_gradient_checkpointing=args.use_gradient_checkpointing,
random_state=args.random_state,
use_rslora=args.use_rslora,
loftq_config=args.loftq_config,
)
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
instructions = examples["instruction"]
inputs = examples["input"]
outputs = examples["output"]
texts = []
for instruction, input, output in zip(instructions, inputs, outputs):
text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
texts.append(text)
return {"text": texts}
# Load and format dataset
dataset = load_dataset(args.dataset, split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)
print("Data is formatted and ready!")
# Configure training arguments
training_args = TrainingArguments(
per_device_train_batch_size=args.per_device_train_batch_size,
gradient_accumulation_steps=args.gradient_accumulation_steps,
warmup_steps=args.warmup_steps,
max_steps=args.max_steps,
learning_rate=args.learning_rate,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=args.logging_steps,
optim=args.optim,
weight_decay=args.weight_decay,
lr_scheduler_type=args.lr_scheduler_type,
seed=args.seed,
output_dir=args.output_dir,
report_to=args.report_to,
)
# Initialize trainer
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=args.max_seq_length,
dataset_num_proc=2,
packing=False,
args=training_args,
)
# Train model
trainer_stats = trainer.train()
# Save model
if args.save_model:
# If args.quantization is a list, save the model once per quantization method
if args.save_gguf:
if isinstance(args.quantization, list):
for quantization_method in args.quantization:
print(f"Saving model with quantization method: {quantization_method}")
model.save_pretrained_gguf(
args.save_path,
tokenizer,
quantization_method=quantization_method,
)
if args.push_model:
model.push_to_hub_gguf(
hub_path=args.hub_path,
hub_token=args.hub_token,
quantization_method=quantization_method,
)
else:
print(f"Saving model with quantization method: {args.quantization}")
model.save_pretrained_gguf(args.save_path, tokenizer, quantization_method=args.quantization)
if args.push_model:
model.push_to_hub_gguf(
hub_path=args.hub_path,
hub_token=args.hub_token,
quantization_method=args.quantization,
)
else:
model.save_pretrained_merged(args.save_path, tokenizer, args.save_method)
if args.push_model:
model.push_to_hub_merged(args.hub_path, tokenizer, args.hub_token)
else:
print("Warning: The model is not saved!")
if __name__ == "__main__":
# Define argument parser
parser = argparse.ArgumentParser(description="🦥 Fine-tune your LLM faster using Unsloth!")
model_group = parser.add_argument_group("🤖 Model Options")
model_group.add_argument('--model_name', type=str, default="unsloth/llama-3-8b", help="Model name to load")
model_group.add_argument('--max_seq_length', type=int, default=2048, help="Maximum sequence length, default is 2048. We automatically support RoPE scaling internally!")
model_group.add_argument('--dtype', type=str, default=None, help="Data type for model (None for auto detection)")
model_group.add_argument('--load_in_4bit', action='store_true', help="Use 4bit quantization to reduce memory usage")
model_group.add_argument('--dataset', type=str, default="yahma/alpaca-cleaned", help="Huggingface dataset to use for training")
lora_group = parser.add_argument_group("🧠 LoRA Options", "These options are used to configure the LoRA model.")
lora_group.add_argument('--r', type=int, default=16, help="Rank for Lora model, default is 16. (common values: 8, 16, 32, 64, 128)")
lora_group.add_argument('--lora_alpha', type=int, default=16, help="LoRA alpha parameter, default is 16. (common values: 8, 16, 32, 64, 128)")
lora_group.add_argument('--lora_dropout', type=float, default=0, help="LoRA dropout rate, default is 0.0 which is optimized.")
lora_group.add_argument('--bias', type=str, default="none", help="Bias setting for LoRA")
lora_group.add_argument('--use_gradient_checkpointing', type=str, default="unsloth", help="Use gradient checkpointing")
lora_group.add_argument('--random_state', type=int, default=3407, help="Random state for reproducibility, default is 3407.")
lora_group.add_argument('--use_rslora', action='store_true', help="Use rank stabilized LoRA")
lora_group.add_argument('--loftq_config', type=str, default=None, help="Configuration for LoftQ")
training_group = parser.add_argument_group("🎓 Training Options")
training_group.add_argument('--per_device_train_batch_size', type=int, default=2, help="Batch size per device during training, default is 2.")
training_group.add_argument('--gradient_accumulation_steps', type=int, default=4, help="Number of gradient accumulation steps, default is 4.")
training_group.add_argument('--warmup_steps', type=int, default=5, help="Number of warmup steps, default is 5.")
training_group.add_argument('--max_steps', type=int, default=400, help="Maximum number of training steps.")
training_group.add_argument('--learning_rate', type=float, default=2e-4, help="Learning rate, default is 2e-4.")
training_group.add_argument('--optim', type=str, default="adamw_8bit", help="Optimizer type.")
training_group.add_argument('--weight_decay', type=float, default=0.01, help="Weight decay, default is 0.01.")
training_group.add_argument('--lr_scheduler_type', type=str, default="linear", help="Learning rate scheduler type, default is 'linear'.")
training_group.add_argument('--seed', type=int, default=3407, help="Seed for reproducibility, default is 3407.")
# Report/Logging arguments
report_group = parser.add_argument_group("📊 Report Options")
report_group.add_argument('--report_to', type=str, default="tensorboard",
choices=["azure_ml", "clearml", "codecarbon", "comet_ml", "dagshub", "dvclive", "flyte", "mlflow", "neptune", "tensorboard", "wandb", "all", "none"],
help="The list of integrations to report the results and logs to. Supported platforms are: \n\t\t 'azure_ml', 'clearml', 'codecarbon', 'comet_ml', 'dagshub', 'dvclive', 'flyte', 'mlflow', 'neptune', 'tensorboard', and 'wandb'. Use 'all' to report to all integrations installed, 'none' for no integrations.")
report_group.add_argument('--logging_steps', type=int, default=1, help="Logging steps, default is 1")
# Saving and pushing arguments
save_group = parser.add_argument_group('💾 Save Model Options')
save_group.add_argument('--output_dir', type=str, default="outputs", help="Output directory")
save_group.add_argument('--save_model', action='store_true', help="Save the model after training")
save_group.add_argument('--save_method', type=str, default="merged_16bit", choices=["merged_16bit", "merged_4bit", "lora"], help="Save method for the model, default is 'merged_16bit'")
save_group.add_argument('--save_gguf', action='store_true', help="Convert the model to GGUF after training")
save_group.add_argument('--save_path', type=str, default="model", help="Path to save the model")
save_group.add_argument('--quantization', type=str, default="q8_0", nargs="+",
help="Quantization method for saving the model. common values ('f16', 'q4_k_m', 'q8_0'), Check our wiki for all quantization methods https://github.com/unslothai/unsloth/wiki#saving-to-gguf ")
push_group = parser.add_argument_group('🚀 Push Model Options')
push_group.add_argument('--push_model', action='store_true', help="Push the model to Hugging Face hub after training")
push_group.add_argument('--push_gguf', action='store_true', help="Push the model as GGUF to Hugging Face hub after training")
push_group.add_argument('--hub_path', type=str, default="hf/model", help="Path on Hugging Face hub to push the model")
push_group.add_argument('--hub_token', type=str, help="Token for pushing the model to Hugging Face hub")
args = parser.parse_args()
run(args)
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings, importlib, sys
from packaging.version import Version
import os, re, subprocess, inspect
import numpy as np
# # Define a list of modules to check
# MODULES_TO_CHECK = ["bitsandbytes"]
# # Check if any of the modules in the list have been imported
# for module in MODULES_TO_CHECK:
# if module in sys.modules:
# raise ImportError(f"Unsloth: Please import Unsloth before {module}.")
# pass
# pass
# Check for unsloth_zoo
try:
import unsloth_zoo
except:
raise ImportError("Unsloth: Please install unsloth_zoo via `pip install unsloth-zoo`")
pass
# Unsloth currently does not work on multi GPU setups - sadly we are a 2 brother team so
# enabling it will require much more work, so we have to prioritize. Please understand!
# We do have a beta version, which you can contact us about!
# Thank you for your understanding and we appreciate it immensely!
if "CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
devices = os.environ["CUDA_VISIBLE_DEVICES"]
# Check if there are multiple cuda devices set in env
if not devices.isdigit():
first_id = devices.split(",")[0]
warnings.warn(
f"Unsloth: 'CUDA_VISIBLE_DEVICES' is currently {devices} \n"\
"Unsloth currently does not support multi GPU setups - but we are working on it!\n"\
"Multiple CUDA devices detected but we require a single device.\n"\
f"We will override CUDA_VISIBLE_DEVICES to first device: {first_id}."
)
os.environ["CUDA_VISIBLE_DEVICES"] = str(first_id)
else:
# warnings.warn("Unsloth: 'CUDA_VISIBLE_DEVICES' is not set. We shall set it ourselves.")
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
pass
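# Usage note (sketch): to pin a specific GPU yourself, set CUDA_VISIBLE_DEVICES before
# importing Unsloth, e.g.
#   import os
#   os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#   import unsloth
# Otherwise the block above falls back to device 0, or to the first listed device.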
# Reduce VRAM usage by reducing fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
try:
import torch
except ModuleNotFoundError:
raise ImportError(
"Unsloth: Pytorch is not installed. Go to https://pytorch.org/.\n"\
"We have some installation instructions on our Github page."
)
except Exception as exception:
raise exception
pass
# Hugging Face Hub faster downloads (only enable during Colab and Kaggle sessions)
keynames = "\n" + "\n".join(os.environ.keys())
if "\nCOLAB_" in keynames or "\nKAGGLE_" in keynames:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
pass
# We support Pytorch 2
# Fixes https://github.com/unslothai/unsloth/issues/38
torch_version = torch.__version__.split(".")
major_torch, minor_torch = torch_version[0], torch_version[1]
major_torch, minor_torch = int(major_torch), int(minor_torch)
if (major_torch < 2):
raise ImportError("Unsloth only supports Pytorch 2 for now. Please update your Pytorch to 2.1.\n"\
"We have some installation instructions on our Github page.")
elif (major_torch == 2) and (minor_torch < 2):
# Disable expandable_segments
del os.environ["PYTORCH_CUDA_ALLOC_CONF"]
pass
# Torch 2.4 has including_emulation
major_version, minor_version = torch.cuda.get_device_capability()
SUPPORTS_BFLOAT16 = (major_version >= 8)
old_is_bf16_supported = torch.cuda.is_bf16_supported
if "including_emulation" in str(inspect.signature(old_is_bf16_supported)):
def is_bf16_supported(including_emulation = False):
return old_is_bf16_supported(including_emulation)
torch.cuda.is_bf16_supported = is_bf16_supported
else:
def is_bf16_supported(): return SUPPORTS_BFLOAT16
torch.cuda.is_bf16_supported = is_bf16_supported
pass
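# Net effect of the patch above: torch.cuda.is_bf16_supported() keeps working on both old
# and new torch signatures; on builds without `including_emulation` it simply reports
# SUPPORTS_BFLOAT16 (compute capability >= 8, i.e. Ampere or newer).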
# Try loading bitsandbytes and triton
import bitsandbytes as bnb
if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
import triton
libcuda_dirs = lambda: None
if Version(triton.__version__) >= Version("3.0.0"):
try: from triton.backends.nvidia.driver import libcuda_dirs
except: pass
else: from triton.common.build import libcuda_dirs
try:
cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
libcuda_dirs()
except:
warnings.warn(
"Unsloth: Running `ldconfig /usr/lib64-nvidia` to link CUDA."\
)
if os.path.exists("/usr/lib64-nvidia"):
os.system("ldconfig /usr/lib64-nvidia")
elif os.path.exists("/usr/local"):
# Sometimes bitsandbytes cannot be linked properly in Runpod for example
possible_cudas = subprocess.check_output(["ls", "-al", "/usr/local"]).decode("utf-8").split("\n")
find_cuda = re.compile(r"[\s](cuda\-[\d\.]{2,})$")
possible_cudas = [find_cuda.search(x) for x in possible_cudas]
possible_cudas = [x.group(1) for x in possible_cudas if x is not None]
# Try linking cuda folder, or everything in local
if len(possible_cudas) == 0:
os.system("ldconfig /usr/local/")
else:
find_number = re.compile(r"([\d\.]{2,})")
latest_cuda = np.argsort([float(find_number.search(x).group(1)) for x in possible_cudas])[::-1][0]
latest_cuda = possible_cudas[latest_cuda]
os.system(f"ldconfig /usr/local/{latest_cuda}")
pass
importlib.reload(bnb)
importlib.reload(triton)
try:
libcuda_dirs = lambda: None
if Version(triton.__version__) >= Version("3.0.0"):
try: from triton.backends.nvidia.driver import libcuda_dirs
except: pass
else: from triton.common.build import libcuda_dirs
cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
libcuda_dirs()
except:
warnings.warn(
"Unsloth: CUDA is not linked properly.\n"\
"Try running `python -m bitsandbytes` then `python -m xformers.info`\n"\
"We tried running `ldconfig /usr/lib64-nvidia` ourselves, but it didn't work.\n"\
"You need to run in your terminal `sudo ldconfig /usr/lib64-nvidia` yourself, then import Unsloth.\n"\
"Also try `sudo ldconfig /usr/local/cuda-xx.x` - find the latest cuda version.\n"\
"Unsloth will still run for now, but maybe it might crash - let's hope it works!"
)
pass
pass
from .models import *
from .save import *
from .chat_templates import *
from .tokenizer_utils import *
from .trainer import *
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try: import torch
except: raise ImportError('Install torch via `pip install torch`')
from packaging.version import Version as V
v = V(torch.__version__)
cuda = str(torch.version.cuda)
is_ampere = torch.cuda.get_device_capability()[0] >= 8
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4": raise RuntimeError(f"CUDA = {cuda} not supported!")
if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
elif v < V('2.3.0'): x = 'cu{}{}-torch220'
elif v < V('2.4.0'): x = 'cu{}{}-torch230'
elif v < V('2.5.0'): x = 'cu{}{}-torch240'
elif v < V('2.6.0'): x = 'cu{}{}-torch250'
else: raise RuntimeError(f"Torch = {v} too new!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
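# For example, on torch 2.4.0 with CUDA 12.1 and an Ampere GPU this prints:
#   pip install --upgrade pip && pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"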
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
"get_chat_template",
"test_chat_templates",
"test_hf_gguf_equivalence",
"remove_special_tokens",
"to_sharegpt",
"standardize_sharegpt",
"apply_chat_template",
"train_on_responses_only",
"test_construct_chat_template",
]
from transformers import StoppingCriteria, StoppingCriteriaList
from torch import LongTensor, FloatTensor
from transformers.models.llama.modeling_llama import logger
from .save import patch_saving_functions
import os
import shutil
from .tokenizer_utils import *
from .models._utils import patch_tokenizer
import re
from unsloth_zoo.dataset_utils import (
train_on_responses_only,
)
CHAT_TEMPLATES = {}
# =========================================== Unsloth
# Unsloth efficient template leverages from Zephyr
unsloth_template = \
"{{ bos_token }}"\
"{% if messages[0]['role'] == 'system' %}"\
"{{ messages[0]['content'] + '\n' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% else %}"\
"{{ 'You are a helpful assistant to the user\n' }}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '>>> User: ' + message['content'] + '\n' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '>>> Assistant: ' }}"\
"{% endif %}"
pass
unsloth_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}>>> User: {{ .Prompt }}
{{ end }}>>> Assistant: {{ .Response }}{__EOS_TOKEN__}
"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
SYSTEM """You are a helpful assistant to the user"""
'''
unsloth_eos_token = "eos_token"
CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False, unsloth_ollama,)
pass
# =========================================== Zephyr
# Zephyr has no BOS!
zephyr_template = \
"{% for message in messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '<|user|>\n' + message['content'] + eos_token + '\n' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}"\
"{% else %}"\
"{{ '<|system|>\n' + message['content'] + eos_token + '\n' }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '<|assistant|>\n' }}"\
"{% endif %}"
pass
zephyr_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}<|system|>
{{ .System }}{__EOS_TOKEN__}
{{ end }}{{ if .Prompt }}<|user|>
{{ .Prompt }}{__EOS_TOKEN__}
{{ end }}<|assistant|>
{{ .Response }}{__EOS_TOKEN__}
"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
zephyr_eos_token = "eos_token"
CHAT_TEMPLATES["zephyr"] = (zephyr_template, zephyr_eos_token, False, zephyr_ollama,)
pass
# =========================================== ChatML
# ChatML has no BOS and no EOS! Rather, <|im_start|> and <|im_end|> act as BOS / EOS.
chatml_template = \
"{% for message in messages %}"\
"{% if message['role'] == 'user' %}"\
"{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'}}"\
"{% elif message['role'] == 'assistant' %}"\
"{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' }}"\
"{% else %}"\
"{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '<|im_start|>assistant\n' }}"\
"{% endif %}"
pass
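# Rendered sketch for a single user turn with add_generation_prompt=True:
#   <|im_start|>user
#   Hello!<|im_end|>
#   <|im_start|>assistant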
chatml_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
"""
PARAMETER stop "<|im_start|>"
PARAMETER stop "<|im_end|>"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
chatml_eos_token = "<|im_end|>"
CHAT_TEMPLATES["chatml"] = (chatml_template, chatml_eos_token, True, chatml_ollama,)
pass
# =========================================== Mistral-1
# Mistral Instruct doesn't allow system prompts, so we prepend them to the first user message.
mistral_template = \
"{{ bos_token }}"\
"{% if messages[0]['role'] == 'system' %}"\
"{% if messages[1]['role'] == 'user' %}"\
"{{ '[INST] ' + messages[0]['content'] + ' ' + messages[1]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[2:] %}"\
"{% else %}"\
"{{ '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% endif %}"\
"{% else %}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '[INST] ' + message['content'] + ' [/INST]' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ message['content'] + eos_token }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"
pass
# Ollama from https://www.ollama.com/library/mistral
mistral_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
mistral_eos_token = "eos_token"
CHAT_TEMPLATES["mistral"] = (mistral_template, mistral_eos_token, False, mistral_ollama,)
pass
# =========================================== Llama-2
# Adds BOS to every convo! And weird <<SYS>> system messages.
llama_template = \
"{% if messages[0]['role'] == 'system' %}"\
"{% if messages[1]['role'] == 'user' %}"\
"{{ bos_token + '[INST] <<SYS>>\n' + messages[0]['content'] + '\n<</SYS>>\n\n' + messages[1]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[2:] %}"\
"{% else %}"\
"{{ bos_token + '[INST] ' + messages[0]['content'] + ' [/INST]' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% endif %}"\
"{% else %}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ ' ' + message['content'].strip() + ' ' + eos_token }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"
pass
# Ollama from https://www.ollama.com/library/llama3
llama_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """[INST] <<SYS>>{{ .System }}<</SYS>>
{{ .Prompt }} [/INST]"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
llama_eos_token = "eos_token"
CHAT_TEMPLATES["llama"] = (llama_template, llama_eos_token, False, llama_ollama,)
pass
# =========================================== Vicuna
# https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
vicuna_template = \
"{{ bos_token }}"\
"{% if messages[0]['role'] == 'system' %}"\
"{{ messages[0]['content'] + ' ' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% else %}"\
"{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' + ' ' }}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ 'USER: ' + message['content'] + ' ' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ 'ASSISTANT: ' + message['content'] + eos_token }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ 'ASSISTANT:' }}"\
"{% endif %}"
pass
# Ollama from https://www.ollama.com/library/vicuna
vicuna_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} {__EOS_TOKEN__}"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
vicuna_eos_token = "eos_token"
CHAT_TEMPLATES["vicuna"] = (vicuna_template, vicuna_eos_token, False, vicuna_ollama,)
pass
# =========================================== Vicuna Old
# https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
vicuna_old_template = \
"{{ bos_token }}"\
"{% if messages[0]['role'] == 'system' %}"\
"{{ messages[0]['content'] + '\n' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% else %}"\
"{{ 'A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human\\'s questions.' + '\n' }}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '### Human: ' + message['content'] + '\n' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ '### Assistant: ' + message['content'] + eos_token + '\n' }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '### Assistant:' }}"\
"{% endif %}"
pass
vicuna_old_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}### Human: {{ .Prompt }}
{{ end }}### Assistant: {{ .Response }}{__EOS_TOKEN__}
"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
SYSTEM """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."""
'''
vicuna_old_eos_token = "eos_token"
CHAT_TEMPLATES["vicuna_old"] = (vicuna_old_template, vicuna_old_eos_token, False, vicuna_old_ollama,)
pass
# =========================================== Alpaca multi turn
# https://github.com/tatsu-lab/stanford_alpaca Changed for multi-turn convos
alpaca_template = \
"{{ bos_token }}"\
"{% if messages[0]['role'] == 'system' %}"\
"{{ messages[0]['content'] + '\n\n' }}"\
"{% set loop_messages = messages[1:] %}"\
"{% else %}"\
"{{ 'Below are some instructions that describe some tasks. Write responses that appropriately complete each request.\n\n' }}"\
"{% set loop_messages = messages %}"\
"{% endif %}"\
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '### Instruction:\n' + message['content'] + '\n\n' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ '### Response:\n' + message['content'] + eos_token + '\n\n' }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '### Response:\n' }}"\
"{% endif %}"
pass
alpaca_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}{{ .System }}
{{ end }}{{ if .Prompt }}### Instruction:
{{ .Prompt }}{{ end }}
### Response:
{{ .Response }}{__EOS_TOKEN__}
"""
PARAMETER stop "{__EOS_TOKEN__}"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
SYSTEM """Below are some instructions that describe some tasks. Write responses that appropriately complete each request."""
'''
alpaca_eos_token = "eos_token"
CHAT_TEMPLATES["alpaca"] = (alpaca_template, alpaca_eos_token, False, alpaca_ollama,)
pass
# =========================================== Gemma
# https://huggingface.co/google/gemma-7b-it
# Notice we must use |trim for lstrip and rstrip. <start_of_turn> maps to 106.
# <end_of_turn> maps to 107. user and model are normal 1 word tokens.
gemma_template = \
"{{ bos_token }}"\
"{% if messages[0]['role'] == 'system' %}"\
"{{'<start_of_turn>user\n' + messages[0]['content'] | trim + ' ' + messages[1]['content'] | trim + '<end_of_turn>\n'}}"\
"{% set loop_messages = messages[2:] %}"\
"{% endif %}"\
"{% for message in messages %}"\
"{% if message['role'] == 'user' %}"\
"{{'<start_of_turn>user\n' + message['content'] | trim + '<end_of_turn>\n'}}"\
"{% elif message['role'] == 'assistant' %}"\
"{{'<start_of_turn>model\n' + message['content'] | trim + '<end_of_turn>\n' }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '<start_of_turn>model\n' }}"\
"{% endif %}"
pass
# Ollama from https://www.ollama.com/library/gemma
gemma_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """<start_of_turn>user
{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}<end_of_turn>
<start_of_turn>model
{{ .Response }}<end_of_turn>
"""
PARAMETER repeat_penalty 1
PARAMETER stop "<start_of_turn>"
PARAMETER stop "<end_of_turn>"
PARAMETER penalize_newline false
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
gemma_eos_token = "<end_of_turn>"
CHAT_TEMPLATES["gemma"] = (gemma_template, gemma_eos_token, True, gemma_ollama,)
pass
# =========================================== Gemma with ChatML instead
# We find using <eos> is still more appropriate!
gemma_chatml_template = "{{ bos_token }}" + chatml_template
pass
gemma_chatml_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ .Response }}<|im_end|>
"""
PARAMETER repeat_penalty 1
PARAMETER stop "<|im_start|>"
PARAMETER stop "<|im_end|>"
PARAMETER penalize_newline false
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
gemma_chatml_eos_token = (
{"<start_of_turn>" : "<|im_start|>", "<eos>" : "<|im_end|>"},
"<|im_end|>",
)
CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True, gemma_chatml_ollama,)
pass
# =========================================== Gemma 2
# Same as Gemma 1, but with sliding window attention!
# https://ollama.com/library/gemma2/blobs/6522ca797f47
gemma2_template = gemma_template
gemma2_ollama = gemma_ollama + "PARAMETER num_ctx 4096\n"
gemma2_eos_token = "<end_of_turn>"
CHAT_TEMPLATES["gemma2"] = (gemma2_template, gemma2_eos_token, True, gemma2_ollama,)
# =========================================== Gemma 2 with ChatML instead
gemma2_chatml_template = gemma_chatml_template
gemma2_chatml_ollama = gemma_chatml_ollama + "PARAMETER num_ctx 4096\n"
gemma2_chatml_eos_token = gemma_chatml_eos_token
CHAT_TEMPLATES["gemma2_chatml"] = (gemma2_chatml_template, gemma2_chatml_eos_token, True, gemma2_chatml_ollama,)
pass
# =========================================== Llama-3
# Weirdly \n\n is needed?
llama3_template = \
"{{ bos_token }}"\
"{% for message in messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\
"{% else %}"\
"{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\
"{% endif %}"
pass
# Ollama from https://www.ollama.com/library/llama3
llama3_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ .Response }}<|eot_id|>"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
llama3_template_eos_token = "eos_token"
CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token, False, llama3_ollama,)
pass
# =========================================== Phi-3
# "{{ bos_token }}"\ # Phi-3.5 removes BOS?
phi3_template = \
"{% for message in messages %}"\
"{% if message['role'] == 'user' %}"\
"{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"\
"{% elif message['role'] == 'assistant' %}"\
"{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}"\
"{% else %}"\
"{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '<|assistant|>\n' }}"\
"{% endif %}"
pass
# Ollama from https://www.ollama.com/library/phi3
phi3_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .System }}<|system|>
{{ .System }}<|end|>
{{ end }}{{ if .Prompt }}<|user|>
{{ .Prompt }}<|end|>
{{ end }}<|assistant|>
{{ .Response }}<|end|>
"""
PARAMETER stop "<|end|>"
PARAMETER stop "<|user|>"
PARAMETER stop "<|assistant|>"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
phi3_template_eos_token = "<|end|>"
CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token, False, phi3_ollama,)
CHAT_TEMPLATES["phi-35"] = CHAT_TEMPLATES["phi-3"]
CHAT_TEMPLATES["phi-3.5"] = CHAT_TEMPLATES["phi-3"]
pass
# =========================================== Llama-3.1
"""
No trimming in Llama 3.1 Instruct!
Also an extra newline for Cutting Knowledge Date
See https://colab.research.google.com/drive/1Xpqq5xpIgO-B00MQ-UccYMwN2J8QFgBM?usp=sharing
The call should also pass the current date, e.g.:
from datetime import datetime
tokenizer.apply_chat_template(
messages,
add_generation_prompt = True,
tokenize = False,
date_string = datetime.today().strftime("%d %B %Y"),
)
"""
llama31_template = \
"""{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
{%- set date_string = "26 July 2024" %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content'] %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}
{#- System message + builtin tools #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if builtin_tools is defined or tools is not none %}
{{- "Environment: ipython\n" }}
{%- endif %}
{%- if builtin_tools is defined %}
{{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- system_message }}
{{- "<|eot_id|>" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0]['content'] %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
{{- "Given the following functions, please respond with a JSON for a function call " }}
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- first_user_message + "<|eot_id|>"}}
{%- endif %}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
{%- elif 'tool_calls' in message %}
{%- if not message.tool_calls|length == 1 %}
{{- raise_exception("This model only supports single tool-calls at once!") }}
{%- endif %}
{%- set tool_call = message.tool_calls[0].function %}
{%- if builtin_tools is defined and tool_call.name in builtin_tools %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- "<|python_tag|>" + tool_call.name + ".call(" }}
{%- for arg_name, arg_val in tool_call.arguments | items %}
{{- arg_name + '="' + arg_val + '"' }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- ")" }}
{%- else %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- '{"name": "' + tool_call.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{%- endif %}
{%- if builtin_tools is defined %}
{#- This means we're in ipython mode #}
{{- "<|eom_id|>" }}
{%- else %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
{%- if message.content is mapping or message.content is iterable %}
{{- message.content | tojson }}
{%- else %}
{{- message.content }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
"""
pass
# Ollama from https://ollama.com/library/llama3.1 (needs updating!)
llama31_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{ if .Messages }}
{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
{{- if .System }}
{{ .System }}
{{- end }}
{{- if .Tools }}
You are a helpful assistant with tool calling capabilities. When you receive a tool call response, use the output to format an answer to the original use question.
{{- end }}
{{- end }}<|eot_id|>
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
{{- if and $.Tools $last }}
Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
{{ $.Tools }}
{{- end }}
{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
{{ end }}
{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|>
{{- if .ToolCalls }}
{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }}
{{- else }}
{{ .Content }}{{ if not $last }}<|eot_id|>{{ end }}
{{- end }}
{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|>
{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
{{ end }}
{{- end }}
{{- end }}
{{- else }}
{{- if .System }}<|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ end }}{{ .Response }}{{ if .Response }}<|eot_id|>{{ end }}"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|eom_id|>"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
llama31_template_eos_token = "eos_token"
CHAT_TEMPLATES["llama-3.1"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,)
CHAT_TEMPLATES["llama-31"] = (llama31_template, llama31_template_eos_token, False, llama31_ollama,)
pass
# =========================================== Qwen 2.5
qwen25_template = \
"""{%- if tools %}
{{- \'<|im_start|>system\\n\' }}
{%- if messages[0][\'role\'] == \'system\' %}
{{- messages[0][\'content\'] }}
{%- else %}
{{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}
{%- endif %}
{{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}
{%- for tool in tools %}
{{- "\\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}
{%- if messages[0][\'role\'] == \'system\' %}
{{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}
{%- else %}
{{- \'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n\' }}
{%- endif %}\n{%- endif %}\n{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}
{%- elif message.role == "assistant" %}
{{- \'<|im_start|>\' + message.role }}
{%- if message.content %}
{{- \'\\n\' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- \'\\n<tool_call>\\n{"name": "\' }}
{{- tool_call.name }}
{{- \'", "arguments": \' }}
{{- tool_call.arguments | tojson }}
{{- \'}\\n</tool_call>\' }}
{%- endfor %}
{{- \'<|im_end|>\\n\' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} {{- \'<|im_start|>user\' }}
{%- endif %}
{{- \'\\n<tool_response>\\n\' }}
{{- message.content }}
{{- \'\\n</tool_response>\' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- \'<|im_end|>\\n\' }}
{%- endif %}
{%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}
{{- \'<|im_start|>assistant\\n\' }}
{%- endif %}
"""
# Ollama from https://ollama.com/library/qwen2.5/blobs/eb4402837c78
qwen25_ollama = \
'''
FROM {__FILE_LOCATION__}
TEMPLATE """{{- if .Messages }}
{{- if or .System .Tools }}<|im_start|>system
{{- if .System }}
{{ .System }}
{{- end }}
{{- if .Tools }}
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{{- range .Tools }}
{"type": "function", "function": {{ .Function }}}
{{- end }}
</tools>
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
{{- end }}<|im_end|>
{{ end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 -}}
{{- if eq .Role "user" }}<|im_start|>user
{{ .Content }}<|im_end|>
{{ else if eq .Role "assistant" }}<|im_start|>assistant
{{ if .Content }}{{ .Content }}
{{- else if .ToolCalls }}<tool_call>
{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
{{ end }}</tool_call>
{{- end }}{{ if not $last }}<|im_end|>
{{ end }}
{{- else if eq .Role "tool" }}<|im_start|>user
<tool_response>
{{ .Content }}
</tool_response><|im_end|>
{{ end }}
{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
{{ end }}
{{- end }}
{{- else }}
{{- if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}"""
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|endoftext|>"
PARAMETER temperature 1.5
PARAMETER min_p 0.1
'''
qwen25_template_eos_token = "eos_token"
CHAT_TEMPLATES["qwen-2.5"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
CHAT_TEMPLATES["qwen-25"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
CHAT_TEMPLATES["qwen25"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
CHAT_TEMPLATES["qwen2.5"] = (qwen25_template, qwen25_template_eos_token, False, qwen25_ollama,)
pass
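# Note (descriptive, based on how `get_chat_template` below unpacks entries):
# each CHAT_TEMPLATES value is a 4-tuple of
#   (jinja_template, stop_word_or_eos_marker, map_eos_token_flag, ollama_modelfile)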
def get_chat_template(
tokenizer,
chat_template = "chatml",
mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"},
map_eos_token = True,
system_message = None,
):
assert(type(map_eos_token) is bool)
old_tokenizer = tokenizer
IS_GEMMA = False
if tokenizer.__class__.__name__.startswith("Gemma"):
if chat_template == "chatml": chat_template = "gemma_chatml"
IS_GEMMA = True
pass
# We add a check for Llama-3
# if chat_template == "llama-3":
# tokenizer._using_llama3_template = True
# else:
# llama3_tokens = set(["<|end_header_id|>", "<|eot_id|>", "<|start_header_id|>"])
# check_llama3_tokens = llama3_tokens & set(str(x) for x in tokenizer.added_tokens_decoder.values())
# if len(check_llama3_tokens) == len(llama3_tokens):
# tokenizer._using_llama3_template = True
# pass
# pass
# We first check if the tokenizer is a fast one. If not, we cannot convert this!
is_fast_tokenizer = getattr(tokenizer, "is_fast", False)
old_padding_side = tokenizer.padding_side
same_padding_token = False
if type(chat_template) in (list, tuple,):
chat_template, stop_word = chat_template
assert(type(chat_template) is str)
assert(type(stop_word) is str)
ollama_modelfile = None
elif type(chat_template) is str:
chat_template, stop_word, yes_map_eos_token, ollama_modelfile = CHAT_TEMPLATES[chat_template]
# Check mapping to eos_token
if not map_eos_token and yes_map_eos_token: map_eos_token = True
if not yes_map_eos_token and map_eos_token: map_eos_token = False
if type(stop_word) in (list, tuple,):
token_mapping, stop_word = stop_word
assert(type(token_mapping) is dict)
else:
token_mapping = None
assert(type(stop_word) is str)
# Check fast tokenizer
if not is_fast_tokenizer:
print(
"Unsloth: Not a fast tokenizer, so can't process it as of yet :(\n"\
"Please log a Github issue if you want this as a new feature!\n"\
"Your chat template will still work, but it won't add or edit tokens."
)
elif token_mapping is not None:
# token_mapping = {"<start_of_turn>" : "<|im_start|>", "<end_of_turn>" : "<|im_end|>"}
# For Gemma :)
string_vocab = tokenizer._tokenizer.to_str()
skipped = 0
for old_token, new_token in token_mapping.items():
old_count = string_vocab.count(f'"{old_token}"')
new_count = string_vocab.count(f'"{new_token}"')
if new_count != 0:
print(f"{new_token} is already a token. Skipping.")
skipped += 1
elif old_count == 0:
raise RuntimeError(f"{old_token} was not part of the tokenizer!")
else:
string_vocab = string_vocab.replace(f'"{old_token}"', f'"{new_token}"')
pass
pass
if map_eos_token and (not stop_word in token_mapping.values()):
# Do not map 107 = <|im_end|> and 1 = <|im_end|>. This will reduce the vocab size by 1
logger.warning_once(f"Unsloth: Will map {stop_word} to EOS = {tokenizer.eos_token}.")
string_vocab = string_vocab.replace(tokenizer.eos_token, stop_word)
pass
if skipped != len(token_mapping):
new_tokenizer = tokenizer._tokenizer.from_str(string_vocab)
# Careful on pad_token
old_pad_token = tokenizer.pad_token
if old_pad_token == tokenizer.eos_token:
old_pad_token = stop_word
same_padding_token = True
pass
if map_eos_token:
new_tokenizer = tokenizer.__class__(
tokenizer_object = new_tokenizer,
eos_token = stop_word,
pad_token = old_pad_token,
)
else:
new_tokenizer = tokenizer.__class__(
tokenizer_object = new_tokenizer,
pad_token = old_pad_token,
)
pass
# Must fix the sentence piece tokenizer since there's no tokenizer.model file!
tokenizer = fix_sentencepiece_tokenizer(tokenizer, new_tokenizer, token_mapping,)
else:
pass
elif map_eos_token and (stop_word != "eos_token"):
logger.warning_once(f"Unsloth: Will map {stop_word} to EOS = {tokenizer.eos_token}.")
# Replaces the old EOS token with a new one.
# Useful for ChatML <|im_end|> for example.
# Usually we train 2 more tokens <|im_start|> and <|im_end|>
# But training the lm_head and embeddings are slow!
# This is a HACK!
# Idea from https://huggingface.co/cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser
old_bos_token = getattr(tokenizer, "bos_token", None)
old_eos_token = getattr(tokenizer, "eos_token", None)
old_pad_token = getattr(tokenizer, "pad_token", None)
old_unk_token = getattr(tokenizer, "unk_token", None)
string_vocab = tokenizer._tokenizer.to_str()
# First check if new stop_word is in the tokenizer
if stop_word in string_vocab:
# We shall swap them around
temporary_stop_token = "<|:__TEMP//STOP//TOKEN__:|>"
string_vocab = string_vocab.replace(old_eos_token, temporary_stop_token)
string_vocab = string_vocab.replace(stop_word, old_eos_token)
string_vocab = string_vocab.replace(temporary_stop_token, stop_word)
else:
string_vocab = string_vocab.replace(old_eos_token, stop_word)
pass
new_tokenizer = tokenizer._tokenizer.from_str(string_vocab)
# Careful on pad_token
if old_pad_token == old_eos_token:
old_pad_token = stop_word
same_padding_token = True
pass
new_tokenizer = tokenizer.__class__(
tokenizer_object = new_tokenizer,
bos_token = old_bos_token,
eos_token = stop_word,
unk_token = old_unk_token,
pad_token = old_pad_token,
)
# Must fix the sentence piece tokenizer since there's no tokenizer.model file!
token_mapping = { old_eos_token : stop_word, }
tokenizer = fix_sentencepiece_tokenizer(tokenizer, new_tokenizer, token_mapping,)
pass
else:
raise TypeError(
f"Unsloth: `chat_template` must be a tuple of (your_template, eos_token,) or one of\n"\
f"{CHAT_TEMPLATES.keys()}"
)
pass
# Careful on Gemma
# bos_token is a must or else losses become too high
if IS_GEMMA and not chat_template.startswith(("{{ bos_token }}", "{{- bos_token }}")):
chat_template = "{{ bos_token }}" + chat_template
pass
# For ShareGPT role -> from and content -> value
new_chat_template = chat_template\
.replace("'role'", "'" + mapping["role"] + "'")\
.replace("'content'", "'" + mapping["content"] + "'")\
.replace("'user'", "'" + mapping["user"] + "'")\
.replace("'assistant'", "'" + mapping["assistant"] + "'")
_, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer)
tokenizer.padding_side = old_padding_side
# If not normal HF, we add a check to make old templates work
if mapping != {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}:
chat_template = \
"{% if 'role' in messages[0] %}" + \
chat_template + \
"{% else %}" + \
new_chat_template + \
"{% endif %}"
else:
chat_template = new_chat_template
pass
tokenizer.chat_template = chat_template
# Also fix up other tokens
old_pad_token = getattr(old_tokenizer, "pad_token", None)
old_bos_token = getattr(old_tokenizer, "bos_token", None)
old_unk_token = getattr(old_tokenizer, "unk_token", None)
new_pad_token = getattr(tokenizer, "pad_token", None)
new_bos_token = getattr(tokenizer, "bos_token", None)
new_unk_token = getattr(tokenizer, "unk_token", None)
if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token
if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token
if not same_padding_token:
if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token
pass
# stopping_criteria = create_stopping_criteria(tokenizer, stop_word)
# Patch saving functions
tokenizer = patch_saving_functions(tokenizer)
# Add Ollama
tokenizer._ollama_modelfile = ollama_modelfile
tokenizer._system_message = system_message
return tokenizer#, stopping_criteria
pass
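# A commented usage sketch (not executed here). The model name and the
# ShareGPT-style mapping below are illustrative assumptions, not requirements:
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")
#   tokenizer = get_chat_template(
#       tokenizer,
#       chat_template = "llama-3.1",
#       mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"},
#   )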
def remove_special_tokens(tokenizer, prompt):
# Removes double BOS token
if prompt.startswith(tokenizer.bos_token):
prompt = prompt[len(tokenizer.bos_token):]
pass
return prompt
pass
def _parse_combined_prompt(combined_prompt, dataset):
# Find {...}
possible_columns = re.findall(r"\{(.+?)\}", combined_prompt)
dataset_columns = set(dataset.column_names)
for column in possible_columns:
if column not in dataset_columns:
raise KeyError(
f"Unsloth: Your prompt includes '{column}' but this does not exist in the dataset. "\
f"Only allowed columns are {list(dataset_columns)}"
)
pass
pass
# Find [[...]]
optional_prompts = list(re.finditer(r"\[\[.+?\]\]", combined_prompt, flags = re.DOTALL | re.MULTILINE))
optional_prompts = [(x.span(), x.group(0)) for x in optional_prompts]
final_optional_prompts = []
if len(optional_prompts) != 0:
# Add left
left = optional_prompts[0]
l = left[0][0]
if l != 0: final_optional_prompts.append(combined_prompt[:l])
# Add in between
for left, right in zip(optional_prompts[:-1], optional_prompts[1:]):
l, r = left[0][-1], right[0][0]
final_optional_prompts.append(left)
if l != r: final_optional_prompts.append(combined_prompt[l : r])
pass
final_optional_prompts.append(optional_prompts[-1])
# Add right
right = optional_prompts[-1]
r = right[0][1]
if r != len(combined_prompt): final_optional_prompts.append(combined_prompt[r:])
else:
# Just add in the entire string
final_optional_prompts.append(combined_prompt)
pass
check_combined = "".join(x if type(x) is str else x[1] for x in final_optional_prompts)
assert(combined_prompt == check_combined)
return possible_columns, final_optional_prompts
pass
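# Illustrative sketch of the `combined_prompt` syntax parsed above (the column
# names "instruction" and "input" are hypothetical and must exist in the dataset):
#
#   "{instruction}[[\nYour input is:\n{input}]]"
#
# `{...}` references a dataset column; `[[...]]` wraps an optional chunk that is
# rendered only when its first referenced column is non-empty for that row
# (see `_create_formatter` below).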
def _create_formatter(possible_columns, final_optional_prompts, user_column_name):
# Start final prompt!
function = ["def __combined_prompt_processor__(examples):"]
columns = list(set(possible_columns))
for column in columns:
function.append(f"{' '*4}{column}__ = examples['{column}']")
function.append(f"{' '*4}texts = []")
function.append(f"{' '*4}for ({', '.join(columns)}) in zip({', '.join(f'{x}__' for x in columns)}):")
# Add optional tags as well!
final_prompt = ""
formatter = []
for j, optional_prompt in enumerate(final_optional_prompts):
if type(optional_prompt) is str:
columns = re.findall(r"\{(.+?)\}", optional_prompt)
formatter += columns
# Must escape \n \r
final_prompt += optional_prompt.encode("unicode-escape").decode("utf-8").replace("'", "\\'").replace('"', '\\"')
else:
where, prompt = optional_prompt
# Strip [[...]]
# Must escape \n \r
prompt = prompt[2:-2].encode("unicode-escape").decode("utf-8").replace("'", "\\'").replace('"', '\\"')
columns = re.findall(r"\{(.+?)\}", prompt)
x = f"__optional_{j}__"
prompt = f"{' '*8}{x} = '{prompt}'.format({', '.join(f'{x} = {x}' for x in columns)}) if {columns[0]} else ''"
function.append(prompt)
formatter.append(x)
final_prompt += "{" + x + "}"
pass
pass
function.insert(1, f"{' '*4}__combined_prompt__ = '{final_prompt}'")
function.append(f"{' '*8}texts.append("\
f"__combined_prompt__.format({', '.join(f'{x} = {x}' for x in formatter)}))")
function.append(f"{' '*4}return " + "{ " + f"'{user_column_name}' : texts" + " }")
return "\n".join(function)
pass
def to_sharegpt(
dataset,
merged_prompt = "",
merged_column_name = "instruction",
output_column_name = "output",
remove_unused_columns = True,
conversation_extension = 1,
random_state = 3407,
):
"""
Converts a dataset to ShareGPT style.
ShareGPT requires only 1 input and 1 output field.
This means one has to merge multiple columns into 1 for 1 input field.
Use `conversation_extension` to increase the length of each conversation by randomly
selecting a few and packing them into 1.
merged_prompt = "", Prompt to merge columns into 1 input
merged_column_name = "instruction", Final column name for the input field
output_column_name = "output", Final column name for the output field
remove_unused_columns = True,
conversation_extension = 1, Automatically combines `conversation_extension` convos into 1
random_state = 3407,
"""
if "conversations" in dataset.column_names:
convo = dataset[0]["conversations"]
if type(convo) is list:
raise TypeError("Unsloth: Your dataset is probably already in ShareGPT format!")
pass
pass
possible_columns, final_optional_prompts = _parse_combined_prompt(merged_prompt, dataset)
function = _create_formatter(possible_columns, final_optional_prompts, merged_column_name)
exec(function, globals())
dataset = dataset.map(__combined_prompt_processor__, batched = True, desc = "Merging columns")
def __convert_to_sharegpt__(examples):
users = examples[merged_column_name]
assistants = examples[output_column_name]
texts = [
[
{"from" : "human", "value" : str(user) },
{"from" : "gpt", "value" : str(assistant)},
] \
for user, assistant in zip(users, assistants)
]
return { "conversations" : texts, }
pass
dataset = dataset.map(
__convert_to_sharegpt__,
batched = True,
desc = "Converting to ShareGPT",
# Remove unused columns!
remove_columns = dataset.column_names if remove_unused_columns else None,
)
# Randomly concatenate conversations to create a longer stream!
from datasets import concatenate_datasets
n_extensions = max(conversation_extension-1, 0)
if n_extensions == 0: return dataset
dataset = dataset.rename_columns({"conversations" : "conversations0"})
all_shuffled = [dataset]
for j in range(1, n_extensions+1):
shuffled = dataset.shuffle(seed = random_state+j).rename_columns({"conversations0" : f"conversations{j}"})
all_shuffled.append(shuffled)
pass
dataset = concatenate_datasets(all_shuffled, axis = 1)
# Combine them into 1
function = "def __combine_conversations__(examples):\n"
n_extensions += 1
for j in range(n_extensions):
function += f"{' '*4}conversations{j}__ = examples['conversations{j}']\n"
function += f"{' '*4}convos = []\n"
function += f"{' '*4}for ({', '.join(f'conversations{j}' for j in range(n_extensions))}) "\
f"in zip({', '.join(f'conversations{j}__' for j in range(n_extensions))}):\n"
function += f"{' '*8}convos.append("\
f"{'+'.join(f'conversations{j}' for j in range(n_extensions))})\n"
function += f"{' '*4}return " + "{ " + "'conversations' : convos" + " }"
# Map function
exec(function, globals())
dataset = dataset.map(
__combine_conversations__,
batched = True,
desc = "Extending conversations",
# Remove unused columns!
remove_columns = dataset.column_names if remove_unused_columns else None,
)
return dataset
pass
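# A commented usage sketch (column names are hypothetical assumptions and must
# exist in your dataset):
#
#   dataset = to_sharegpt(
#       dataset,
#       merged_prompt = "{instruction}[[\nYour input is:\n{input}]]",
#       output_column_name = "output",
#       conversation_extension = 3, # randomly packs 3 rows into one conversation
#   )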
def standardize_sharegpt(
dataset,
aliases_for_system = ["system",],
aliases_for_user = ["user", "human", "input",],
aliases_for_assistant = ["gpt", "assistant", "output",],
):
"""
Standardizes ShareGPT and other formats to the user/assistant Hugging Face format.
Provide aliases for the system, user and assistant roles;
these map to "system", "user" and "assistant" respectively.
aliases_for_system = ["system",],
aliases_for_user = ["user", "human", "input",],
aliases_for_assistant = ["gpt", "assistant", "output",],
"""
import collections
import itertools
convos = dataset[:10]["conversations"]
uniques = collections.defaultdict(list)
for convo in convos:
for message in convo:
for key, value in message.items():
uniques[key].append(value)
pass
# Must be only 2 entries
assert(len(uniques.keys()) == 2)
keys = list(uniques.keys())
length_first = len(set(uniques[keys[0]]))
length_second = len(set(uniques[keys[1]]))
if length_first < length_second:
# Role is assigned to the first element
role_key = keys[0]
content_key = keys[1]
else:
role_key = keys[1]
content_key = keys[0]
pass
# Check roles are in aliases
all_aliases = set(aliases_for_system + aliases_for_user + aliases_for_assistant)
roles = set(uniques[role_key])
leftover_aliases = (all_aliases | roles) - all_aliases
if len(leftover_aliases) != 0:
raise TypeError(
f"Unsloth: {list(leftover_aliases)} are not in aliases. Please update aliases."
)
pass
# Mapping for aliases
aliases_mapping = {}
for x in aliases_for_system: aliases_mapping[x] = "system"
for x in aliases_for_user: aliases_mapping[x] = "user"
for x in aliases_for_assistant: aliases_mapping[x] = "assistant"
def _standardize_dataset(examples):
convos = examples["conversations"]
all_convos = []
for convo in convos:
new_convo = [
{ "role" : aliases_mapping[message[role_key]], "content" : message[content_key], }
for message in convo
]
all_convos.append(new_convo)
pass
return { "conversations" : all_convos, }
pass
return dataset.map(_standardize_dataset, batched = True, desc = "Standardizing format")
pass
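# A commented usage sketch: maps ShareGPT style {"from": "human", "value": ...}
# messages to Hugging Face style {"role": "user", "content": ...} messages,
# using the default aliases above:
#
#   dataset = standardize_sharegpt(dataset)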
def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []):
added_tokens_decoder = tokenizer.added_tokens_decoder.values()
added_tokens_decoder = [str(x) for x in added_tokens_decoder]
# Remove added_tokens_decoder duplicates
added_tokens_decoder = list(set(added_tokens_decoder) - set(extra_eos_tokens))
# Remove BOS
if getattr(tokenizer, "bos_token", None) is not None:
added_tokens_decoder = [x for x in added_tokens_decoder if x != tokenizer.bos_token]
pass
repeatted_tokens = []
# Join all vocab
joined_text = "\x01\x00".join(added_tokens_decoder)
for token in added_tokens_decoder:
n = len(token)
repeatted_counts = joined_text.count(token[:n//2])
# Try matching more than 1/2 of the token against the rest of the vocab
# For example <|reserved_special_token_0|>, <|reserved_special_token_1|>
if repeatted_counts > 2:
for j in range(n//2+1, n):
if joined_text.count(token[:j]) < repeatted_counts:
j -= 1
# Remove repeated tokens to reduce the search space
joined_text = joined_text.replace(token[:j], "")
repeatted_tokens.append(token[:j])
break
pass
pass
pass
# Remove duplicates
splitted = joined_text.split("\x01\x00")
final_eos_tokens = []
for old, new in zip(added_tokens_decoder, splitted):
if old == new: final_eos_tokens.append(old)
pass
final_eos_tokens += extra_eos_tokens
final_eos_tokens += repeatted_tokens
# Remove new lines, spaces and HTML tags
filtered_eos_tokens = []
for token in final_eos_tokens:
if token.count("\n") == len(token): continue
elif token.count("▁") == len(token): continue
elif token.startswith("<") and len(token) <= 2: continue
elif token.startswith("</") and len(token) == 3: continue
filtered_eos_tokens.append(token)
pass
return filtered_eos_tokens
pass
def construct_chat_template( \
tokenizer = None,
chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>
{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|>
{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{OUTPUT}<|eot_id|>""",
default_system_message = \
"Below are some instructions that describe some tasks. Write responses that appropriately complete each request.",
extra_eos_tokens = None,
):
"""
Creates an Ollama modelfile and an HF Jinja template from a custom
template. You must provide 2 examples of an input & output.
There is an optional system message as well.
You must use {INPUT} and {OUTPUT} twice each, and {SYSTEM} is optional.
"""
# Strip only the left
chat_template = chat_template.lstrip()
assert(tokenizer is not None)
if extra_eos_tokens is None: extra_eos_tokens = []
elif type(extra_eos_tokens) is str: extra_eos_tokens = [extra_eos_tokens,]
vocab = tokenizer.get_vocab()
for extra_eos in extra_eos_tokens:
assert(type(extra_eos) is str)
if extra_eos not in vocab:
raise ValueError(f"Unsloth: `{extra_eos}` is not a singular token in the tokenizer.")
pass
pass
error_msg = \
"Unsloth: Your prompt template must have 2 examples showing the user input {INPUT} "\
"and the assistant output {OUTPUT}\n\n"\
"For example what is not allowed is just:\n"\
"### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n\n\n"\
"What is required is 2x of this:\n"\
"### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"\
"### Input:\\n{INPUT}\\n\\n### Response:\\n{OUTPUT}\\n"
# Check for EOS after {OUTPUT}
if tokenizer.eos_token is not None:
extra_eos_tokens.insert(0, tokenizer.eos_token)
if len(extra_eos_tokens) == 0:
raise RuntimeError(
"Unsloth: Your tokenizer does not have an EOS token? Please provide one via extra_eos_tokens!"
)
pass
# Check tokenizer types
tokenizer_name = tokenizer.name_or_path.lower()
if tokenizer_name.startswith(("unsloth/llama-3-8b-instruct", "unsloth/llama-3-70b-instruct")):
# Add <|eot_id|>
extra_eos_tokens.append("<|eot_id|>")
elif ("<|eot_id|>" in extra_eos_tokens or "<|eot_id|>" in chat_template) and \
tokenizer_name.startswith(("unsloth/llama-3-8b", "unsloth/llama-3-70b")):
# Warn
logger.warning(
"Unsloth: Base llama-3 models did not train <|eot_id|>.\n"\
"Please use the instruct version or use <|end_of_text|>"
)
pass
extra_eos_tokens = list(set(extra_eos_tokens))
count_eos = 0
for eos in extra_eos_tokens:
count_eos += len(re.findall(r"{OUTPUT}" + re.escape(eos), chat_template))
pass
# This forces you to provide 2 input and outputs
final_combined_check = False
try:
# O(N^2) search finding 2 repeated pieces of text
j = len(chat_template)-1
at_least_one = False
while j > 0:
found = chat_template.rfind(chat_template[j:], 0, j)
if found == -1: break
j -= 1
at_least_one = True
pass
if j > 0: j += 1
else: raise RuntimeError(error_msg)
if not at_least_one: raise RuntimeError(error_msg)
# Must be equivalent to left
final_combined_check = True
# Repeated text
instruction_response = chat_template[j:]
if instruction_response.count("{INPUT}") != 1 or instruction_response.count("{OUTPUT}") != 1:
raise RuntimeError(error_msg)
pass
# 1st System, Instruction, Output pair
left = chat_template[:j]
# 2nd Instruction, Output pair
right = chat_template[j:]
final_combined_check = left if final_combined_check else chat_template
# Isolate input
extra_eos_tokens_regex = "|".join(f"(?:{re.escape(x)})" for x in extra_eos_tokens)
if len(extra_eos_tokens_regex) != 0:
find_end = f"(?:{extra_eos_tokens_regex})?"
else:
find_end = ""
find_end = r"\{INPUT\}[\s\n]{0,}" + find_end
input_end = list(re.finditer(find_end, right))
assert(len(input_end) == 1)
input_end = input_end[0]
input_end = input_end.span(0)[1]
input_part = right[:input_end]
# Isolate output
output_part = right[input_end:]
# Isolate system
where_system = left.find(input_part)
system_part = left[:where_system if where_system != -1 else len(left)]
# Check if the user provided a correct prompt
combined = system_part + input_part + output_part
if combined != final_combined_check:
combined_changed = combined .replace('\n', '\\n')
left_changed = final_combined_check.replace('\n', '\\n')
raise RuntimeError(
"Unsloth: The prompt template you provided isn't correct. You gave:\n"\
f"{combined_changed}\n\n"\
"But we require the following:\n"\
f"{left_changed}"
)
pass
except:
ending = chat_template[chat_template.find("{OUTPUT}") + len("{OUTPUT}"):]
ending = re.escape(ending)
find_text = "{INPUT}" + ending + "(.+?{OUTPUT}" + ending + ")"
response_part = re.findall(find_text, chat_template, flags = re.DOTALL | re.MULTILINE)
response_part = response_part[0]
for j in range(1, len(response_part)):
try_find = re.escape(response_part[:j])
try: found = next(re.finditer("(" + try_find + ").+?\{INPUT\}", chat_template, flags = re.DOTALL | re.MULTILINE))
except: break
pass
separator = found.group(1)
response_start = chat_template.find(response_part)
start_instruction = chat_template[:response_start].rfind(separator)
if start_instruction == -1: start_instruction = 0
instruction_part = chat_template[start_instruction:response_start]
combined = instruction_part + response_part
where = chat_template.find(combined)
system_part = chat_template[:where]
system_part, input_part, output_part = system_part, instruction_part, response_part
pass
if count_eos == 0:
logger.warning("Unsloth: We automatically added an EOS token to stop endless generations.")
eos = extra_eos_tokens[0]
output_part = output_part + eos
pass
# Ollama modelfile parts
# Check bos_token is in system prompt
ollama_system = system_part
has_bos_token = False
always_bos_token = False
if tokenizer("A").input_ids[0] == getattr(tokenizer, "bos_token_id", None):
always_bos_token = True
if ollama_system.startswith(tokenizer.bos_token):
has_bos_token = True
ollama_system = ollama_system[len(tokenizer.bos_token):]
pass
pass
# Check system
if "{SYSTEM}" in ollama_system:
system_modelfile = "{{ if .System }}" + ollama_system.replace("{SYSTEM}", "{{ .System }}") + "{{ end }}"
else:
system_modelfile = ollama_system
pass
input_modelfile = "{{ if .Prompt }}" + input_part .replace("{INPUT}", "{{ .Prompt }}") + "{{ end }}"
output_modelfile = output_part.replace("{OUTPUT}", "{{ .Response }}")
# Ollama EOS
ollama_eos = get_ollama_eos_tokens(tokenizer, extra_eos_tokens)
ollama_eos = '\n'.join(f'PARAMETER stop "{eos}"' for eos in ollama_eos)
# Add temperature and min_p to counteract gibberish
ollama_eos += "\nPARAMETER temperature 1.5\nPARAMETER min_p 0.1"
# Ollama modelfile
part = '"""'
modelfile = 'FROM {__FILE_LOCATION__}\n\n'\
'TEMPLATE ' + part + system_modelfile + input_modelfile + output_modelfile + \
part + '\n\n' + ollama_eos
# HF Jinja Chat template
def process(part, which, content = "message['content']"):
if part.endswith(which):
part = "'" + part[:part.find(which)] + f"' + {content}"
elif part.startswith(which):
part = f"{content} + '" + part[part.find(which):] + "'"
else:
part = "'" + part.replace(which, f"' + {content} + '") + "'"
if part.startswith("'' + "): part = part[5:]
return part
pass
input_jinja = process(input_part, "{INPUT}")
output_jinja = process(output_part, "{OUTPUT}")
pass
jinja_template = \
"{% for message in loop_messages %}"\
"{% if message['role'] == 'user' %}"\
"{{ " + input_jinja + " }}"\
"{% elif message['role'] == 'assistant' %}"\
"{{ " + output_jinja + " }}"\
"{% else %}"\
"{{ raise_exception('Only user and assistant roles are supported!') }}"\
"{% endif %}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '" + output_part[:output_part.find("{OUTPUT}")] + "' }}"\
"{% endif %}"
pass
# Now add system prompt to jinja
if len(system_part) != 0:
partial_system = process(system_part, "{SYSTEM}", "messages[0]['content']")
partial_system = partial_system.replace("{SYSTEM}", "")
if "{SYSTEM}" in partial_system:
if default_system_message is None:
raise RuntimeError("Unsloth: Please specify a default system message!")
pass
# Separate the BOS
if has_bos_token:
partial_system = partial_system.replace(tokenizer.bos_token, "", 1)
system_part = system_part .replace(tokenizer.bos_token, "", 1)
pass
partial_system = \
"{% if messages[0]['role'] == 'system' %}"\
"{{ " + partial_system + " }}"\
"{% set loop_messages = messages[1:] %}"
if default_system_message is not None:
full_system = system_part.replace("{SYSTEM}", default_system_message)
if "{SYSTEM}" in system_part:
modelfile += '\nSYSTEM "' + default_system_message + '"'
pass
partial_system += "{% else %}"\
"{{ '" + full_system + "' }}"\
"{% set loop_messages = messages %}"\
"{% endif %}"
else:
partial_system += "{% endif %}"
pass
jinja_template = partial_system + jinja_template
if has_bos_token:
jinja_template = "{{ bos_token }}" + jinja_template
pass
# Fix missing loop_messages
if "{% set loop_messages = messages %}" not in jinja_template:
jinja_template = jinja_template.replace(
"{% for message in loop_messages %}",
"{% for message in messages %}",
1, # Only replace the first one
)
pass
# Check if system part is the same!
jinja_template = re.sub(
r"\{\% if messages\[0\]\['role'\] \=\= 'system' \%\}\{\{ '(.+?)' \}\}"\
r"\{\% set loop\_messages \= messages\[1\:\] \%\}"\
r"\{\% else \%\}\{\{ '\1' \}\}\{\% set loop\_messages \= messages \%\}\{\% endif \%\}"\
r"\{\% for message in loop\_messages \%\}",
r"{{ '\1' }}{% for message in messages %}",
jinja_template, flags = re.MULTILINE | re.DOTALL,
)
# Check jinja template for bos
if always_bos_token:
if not jinja_template.startswith(("{{ bos_token }}", "{{- bos_token }}")):
jinja_template = "{{ bos_token }}" + jinja_template
pass
# Get instruction and output parts for train_on_inputs = False
input_part = input_part [:input_part .find("{INPUT}")]
output_part = output_part[:output_part.find("{OUTPUT}")]
return modelfile, jinja_template, input_part, output_part
pass
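# A commented usage sketch (the Llama-3 style template is the default argument
# above; any tokenizer containing the referenced special tokens works):
#
#   modelfile, jinja_template, input_part, output_part = construct_chat_template(
#       tokenizer = tokenizer,
#       chat_template = chat_template,  # must contain {INPUT} and {OUTPUT} twice
#   )
#   tokenizer.chat_template = jinja_template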
def test_construct_chat_template():
token = "hf_"
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token = token)
chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>
{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|>
{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{OUTPUT}<|eot_id|>"""
default_system_message = \
"Below are some instructions that describe some tasks. Write responses that appropriately complete each request."
extra_eos_tokens = None
modelfile, jinja_template, _, _ = construct_chat_template(
tokenizer = tokenizer,
chat_template = chat_template,
extra_eos_tokens = extra_eos_tokens,
)
messages = [
{"role": "system", "content": "You are an assistant"},
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "It's 4."},
{"role": "user", "content": "Ok!"},
{"role": "assistant", "content": "Anything else?"},
{"role": "user", "content": "What's 2x2?"},
]
correct_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
tokenizer.chat_template = jinja_template
new_output = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
assert(correct_output == new_output)
pass
pass
def apply_chat_template( \
dataset,
tokenizer = None,
chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>
{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{OUTPUT}<|eot_id|><|start_header_id|>user<|end_header_id|>
{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{OUTPUT}<|eot_id|>""",
default_system_message = \
"Below are some instructions that describe some tasks. Write responses that appropriately complete each request.",
extra_eos_tokens = None,
):
"""
Creates an Ollama modelfile and an HF Jinja template from a custom
template, then applies the template to the dataset's "conversations" column.
You must provide 2 examples of an input & output, and an optional system message.
You must use {INPUT} and {OUTPUT} twice each, and {SYSTEM} is optional.
"""
modelfile, jinja_template, input_part, output_part = construct_chat_template(
tokenizer = tokenizer,
chat_template = chat_template,
default_system_message = default_system_message,
extra_eos_tokens = extra_eos_tokens,
)
def formatting_prompts_func(examples):
convos = examples["conversations"]
texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
return { "text" : texts, }
pass
tokenizer.chat_template = jinja_template
tokenizer._ollama_modelfile = modelfile
tokenizer._unsloth_input_part = input_part
tokenizer._unsloth_output_part = output_part
return dataset.map(formatting_prompts_func, batched = True,)
pass
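# A commented usage sketch: expects a "conversations" column (for example from
# `standardize_sharegpt`) and writes the rendered prompts to a "text" column:
#
#   dataset = apply_chat_template(
#       dataset,
#       tokenizer = tokenizer,
#       chat_template = chat_template,  # same {SYSTEM}/{INPUT}/{OUTPUT} format as above
#   )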
def create_stopping_criteria(tokenizer, stop_word = "eos_token"):
class StoppingCriteriaSub(StoppingCriteria):
__slots__ = "stop_token", "single_match", "length",
def __init__(self, stops = "eos_token", device = "cuda", encounters = 1):
super().__init__()
if stops == "eos_token":
self.stop_token = torch.tensor(tokenizer.eos_token_id, device = "cuda")
self.length = 1
else:
self.stop_token = tokenizer(["\n" + stops], add_special_tokens = False, return_tensors = "pt")
self.stop_token = self.stop_token.input_ids.ravel()[1:].to("cuda")
self.length = self.stop_token.shape[0]
pass
self.single_match = self.length == 1
pass
def __call__(self, input_ids: LongTensor, scores: FloatTensor) -> bool:
input_ids = input_ids.ravel()
last_token = input_ids[-1]
if self.single_match and (last_token == self.stop_token): return True
if input_ids.shape[0] >= self.length and \
(input_ids[-self.length:] == self.stop_token).all(): return True
return False
pass
pass
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops = stop_word)])
return stopping_criteria
pass
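# A commented usage sketch (assumes a CUDA device, matching the hard-coded
# "cuda" above; `model` and `inputs` are hypothetical):
#
#   stopping_criteria = create_stopping_criteria(tokenizer, stop_word = "<|im_end|>")
#   outputs = model.generate(**inputs, stopping_criteria = stopping_criteria)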
def test_chat_templates():
messages = [
{"role": "system","content": " You are a friendly chatbot.",},
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "It's 4."},
{"role": "user", "content": " But 2+2 is equal to 5. "},
{"role": "assistant", "content": "No I'm sure its 4."},
{"role": "user", "content": " No it's 100% 5! "},
]
# Zephyr
from transformers import AutoTokenizer
template = zephyr_template
correct_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
# Chatml
template = chatml_template
correct_tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")
correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
# Mistral
template = mistral_template
correct_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
# Llama
template = llama_template
correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-2-7b-chat")
correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
# Vicuna
try:
from fastchat.conversation import get_conv_template
except:
os.system("pip -qqq install git+https://github.com/lm-sys/FastChat.git")
from fastchat.conversation import get_conv_template
correct_prompt = get_conv_template("vicuna_v1.1")
for j in range(len(messages)-1):
correct_prompt.append_message(correct_prompt.roles[j%2==1], messages[j+1]["content"])
correct_prompt.append_message(correct_prompt.roles[1], "")
correct_prompt = tokenizer.bos_token + correct_prompt.get_prompt()
template = vicuna_template
correct_tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
try:
from fastchat.conversation import get_conv_template
except:
os.system("pip -qqq install git+https://github.com/lm-sys/FastChat.git")
from fastchat.conversation import get_conv_template
correct_prompt = get_conv_template("zero_shot")
for j in range(len(messages)-1):
correct_prompt.append_message(correct_prompt.roles[j%2==1], messages[j+1]["content"])
correct_prompt.append_message(correct_prompt.roles[1], "")
correct_prompt = tokenizer.bos_token + correct_prompt.get_prompt()
template = vicuna_old_template
correct_tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
# We add </s> ourselves
assert(correct_prompt == our_prompt.replace("</s>", ""))
# Gemma
correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-7b-it")
correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = gemma_template
our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
assert(our_prompt == correct_prompt)
# Llama-3
template = llama3_template
correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")
correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
# Phi-3
template = phi3_template
correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
pass
def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf"):
"""
Carefully checks that GGUF's tokenization matches HF's tokenization.
Can catch all tokenization bugs.
"""
import subprocess
import re
messages = [
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "It's 4."},
{"role": "user", "content": " But 2+2 is equal to 5. "},
{"role": "assistant", "content": "No I'm sure its 4."},
{"role": "user", "content": " No it's 100% 5! "},
]
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}""".format(
"Describe the city given eloquently.", # instruction
"The lost city of Atlantis.", # input
"", # output - leave this blank for generation!
)
prompts = [ prompt, ]
if tokenizer.chat_template is not None:
prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
prompt = prompt.replace("'", "") # Subprocess does not like ''
prompt = remove_special_tokens(tokenizer, prompt)
prompts.append(prompt)
pass
for prompt in prompts:
command = f"./llama.cpp/llama-cli -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
f"--check-tensors -p '{prompt}'"
datas = []
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
datas.append(line.decode("utf-8", errors = "replace"))
pass
gguf_tokens = "".join(datas)
# Now extract GGUF tokenization attempt
gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE)
gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized]
input_ids = tokenizer(prompt).input_ids
tokens = tokenizer.batch_decode(input_ids)
hf_tokenized = list(zip(input_ids, tokens))
# Compare to Huggingface
for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)):
if (hf_token[0] != gguf_token[0]):
print("Failed GGUF != HF at", j)
print("HF =", hf_token)
print("GGUF =", gguf_token)
print(hf_tokenized)
print()
print(gguf_tokenized)
print()
raise RuntimeError("Failed comparing GGUF to HF.")
pass
pass
return True
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .cross_entropy_loss import (
fast_cross_entropy_loss,
patch_llama_for_causal_lm,
unpatch_llama_for_causal_lm,
)
from .rms_layernorm import (
fast_rms_layernorm,
patch_rms_layernorm,
unpatch_rms_layernorm,
)
from .layernorm import (
fast_layernorm,
patch_layernorm,
unpatch_layernorm,
)
from .rope_embedding import fast_rope_embedding, inplace_rope_embedding
from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
from .geglu import (
geglu_exact_forward_kernel,
geglu_exact_backward_kernel,
geglu_approx_forward_kernel,
geglu_approx_backward_kernel,
)
from .fast_lora import (
get_lora_parameters,
get_lora_parameters_bias,
apply_lora_mlp_swiglu,
apply_lora_mlp_geglu_exact,
apply_lora_mlp_geglu_approx,
apply_lora_qkv,
apply_lora_o,
)
from .utils import fast_dequantize, fast_gemv, QUANT_STATE, fast_linear_forward, matmul_lora
from .flex_attention import (
HAS_FLEX_ATTENTION,
slow_attention_softcapping,
slow_inference_attention_softcapping,
create_flex_attention_causal_mask,
create_flex_attention_sliding_window_mask,
)
try:
print("🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.")
except:
print("Unsloth: Will patch your computer to enable 2x faster free finetuning.")
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings, MAX_FUSED_SIZE, triton_tanh
from transformers.models.llama.modeling_llama import logger
@triton.heuristics({
"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING" ],
"DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _cross_entropy_forward(
logits_ptr, logits_row_stride,
loss_ptr,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
DO_LOGIT_SCALING: tl.constexpr,
LOGIT_SCALE : tl.constexpr,
):
"""
Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
Pi = exp(xi) / sum(exp(xi))
CE_i = -y log(p) = -y log[ exp(x) / sum(exp(x)) ]
= -y [ x - log[sum(exp(x))] ]
= y * (log[sum(exp(x))] - x)
If y == 0: CE_i = 0
If y == 1: CE_i = logsumexp - x
logsumexp is also stable
Take y = log[sum(exp(x))]
exp(y) = sum(exp(x))
exp(y) = sum(exp(x - c)*exp(c)) Since e^(x-c)*e^c = e^x
exp(y) = exp(c)*sum(exp(x - c))
y = log(exp(c)*sum(exp(x - c)))
y = c + log[sum(exp(x - c))]
This means we can set c = max(x) to make sure
exp(x - c) always is exp(x - max(x)).
This ensures exp(x - max(x))'s maximum is 1 as exp(0) = 1.
"""
row_idx = tl.program_id(0)
logits_ptr += row_idx * logits_row_stride.to(tl.int64)
loss_ptr += row_idx
logsumexp_ptr += row_idx
labels_ptr += row_idx
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < VOCAB_SIZE
label_idx = tl.load(labels_ptr).to(tl.int32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
logits = logits.to(tl.float32)
c = tl.max(logits, 0)
logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
if label_idx != -100:
x = tl.load(logits_ptr + label_idx)
# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
loss = logsumexp - x.to(tl.float32)
else:
loss = 0.0
tl.store(logsumexp_ptr, logsumexp)
tl.store(loss_ptr, loss)
pass
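# Reference sketch of what the kernel above computes for each row, written in
# plain PyTorch for clarity only (ignores soft-capping and logit scaling):
#
#   c = logits.max(dim = -1).values
#   logsumexp = c + (logits - c[:, None]).exp().sum(dim = -1).log()
#   x = logits.gather(-1, labels.clamp(min = 0)[:, None])[:, 0]
#   loss = torch.where(labels != -100, logsumexp - x, torch.zeros_like(logsumexp))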
@triton.heuristics({
"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING" ],
"DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _chunked_cross_entropy_forward(
logits_ptr, logits_row_stride,
loss_ptr,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
N_CHUNKS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
DO_LOGIT_SCALING: tl.constexpr,
LOGIT_SCALE : tl.constexpr,
):
"""
256K vocab divided in 4 chunks
|-65536-| |-65536-| |-65536-| |-65536-|
|-------| |-------| |-------| |-------|
|-------| |-------| |-------| |-------|
If y == 0: CE_i = 0
If y == 1: CE_i = logsumexp - x
Notice we can compute a logsumexp for each chunk, then take a final
logsumexp over the chunked logsumexps, since
logsumexp(chunked_logsumexp) = log[sum(exp(chunked_logsumexp))]
= log[exp(logsumexp(a)) + ... + exp(logsumexp(z))]
= log[exp(log[sum(exp(a))]) + ... + exp(log[sum(exp(z))])]
= log[sum(exp(a)) + ... + sum(exp(z))]
= logsumexp(x)
This means we can perform a logsumexp for each chunk, then do a
final logsumexp reduction!
Ie do: logsumexp(chunked_logsumexp) - x
"""
row_idx = tl.program_id(0)
chunk_idx = tl.program_id(1)
logits_ptr += row_idx * logits_row_stride.to(tl.int64)
loss_ptr += row_idx
logsumexp_ptr += row_idx * N_CHUNKS + chunk_idx
labels_ptr += row_idx
col_offsets = chunk_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = col_offsets < VOCAB_SIZE
label_idx = tl.load(labels_ptr).to(tl.int32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
logits = logits.to(tl.float32)
c = tl.max(logits, 0)
logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
if chunk_idx == 0:
# logsumexp(chunked_logsumexp) - x
# Do the -x separately
if label_idx != -100:
x = tl.load(logits_ptr + label_idx).to(tl.float32)
# Go logit scaling for Cohere: t * x
if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
loss = -1.0 * x.to(tl.float32)
else:
loss = 0.0
tl.store(loss_ptr, loss)
pass
tl.store(logsumexp_ptr, logsumexp)
pass
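# The chunked kernel relies on logsumexp(chunked_logsumexp) == logsumexp(x).
# A commented sanity check in plain PyTorch:
#
#   x = torch.randn(4 * 16384)
#   per_chunk = torch.logsumexp(x.view(4, -1), dim = -1)
#   assert torch.allclose(torch.logsumexp(per_chunk, dim = 0), torch.logsumexp(x, dim = 0))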
@triton.heuristics({
"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING" ],
"DO_LOGIT_SCALING": lambda args: args["DO_LOGIT_SCALING"],
})
@triton.jit
def _cross_entropy_backward(
logits_ptr, logits_row_stride,
dloss_ptr, dloss_row_stride,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
DO_LOGIT_SCALING: tl.constexpr,
LOGIT_SCALE : tl.constexpr,
):
"""
CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
dC/dx = d/dx (y * log[sum(exp(x))] - x * y)
From https://en.wikipedia.org/wiki/LogSumExp
d/dx logsumexp = exp(x) / sum(exp(x)) = softmax(x)
dC/dx = y * exp(x) / sum(exp(x)) - d/dx (x * y)
dC/dx = y * exp[ log[exp(x) / sum(exp(x))] ] using x = exp(log(x)) trick
dC/dx = y * exp[x - logsumexp] - d/dx (x * y)
If y == 0: dC/dx = 0
If y == 1 and x == label: dC/dlabel = exp[x - logsumexp] - 1
If y == 1 and x != label: dC/dx = exp[x - logsumexp]
"""
row_idx = tl.program_id(0)
block_idx = tl.program_id(1)
logits_ptr += row_idx * logits_row_stride.to(tl.int64)
dloss_ptr += row_idx * dloss_row_stride
col_offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = col_offsets < VOCAB_SIZE
label_idx = tl.load(labels_ptr + row_idx).to(tl.int32)
if label_idx != -100:
dloss = tl.load(dloss_ptr)
else:
dloss = 0.0
x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
# Do logit scaling for Cohere
if DO_LOGIT_SCALING:
# d/dx [s * x] = s
x = x * LOGIT_SCALE
pass
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING:
# d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
partial = triton_tanh(x / SOFTCAP)
x = SOFTCAP * partial
pass
logsumexp = tl.load(logsumexp_ptr + row_idx)
y = tl.exp(x.to(tl.float32) - logsumexp)
y = tl.where(
col_offsets == label_idx,
y - 1.0, # exp(x - logsumexp) - 1
y, # exp(x - logsumexp)
)
if DO_LOGIT_SCALING:
# d/dx [s * x] = s
y = y * LOGIT_SCALE
pass
if DO_SOFTCAPPING:
# d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
y = y * (1.0 - partial*partial)
pass
# If y == 0: dC/dx = 0 ==> we already masked it to be = 0, so dloss = 0.
tl.store(logits_ptr + col_offsets, dloss * y, mask = mask)
pass
MAX_FUSED_SIZE = 16384 #65536 # 2**16
class Fast_CrossEntropyLoss(torch.autograd.Function):
@staticmethod
def forward(ctx, logits, labels, logit_softcapping = 0, logit_scaling = 0):
n_rows, vocab_size = logits.shape
div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
n_chunks = div + (mod != 0)
#losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
losses = torch.empty(n_rows, dtype = torch.float32, device = logits.device)
DO_SOFTCAPPING = (logit_softcapping != 0)
DO_LOGIT_SCALING = (logit_scaling != 0)
if n_chunks == 1:
# For small vocabs <= 65536 like Llama, Mistral
BLOCK_SIZE, num_warps = calculate_settings(vocab_size)
#logsumexp = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
logsumexp = torch.empty(n_rows, dtype = torch.float32, device = logits.device)
_cross_entropy_forward[(n_rows,)](
logits, logits.stride(0),
losses,
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
DO_LOGIT_SCALING = DO_LOGIT_SCALING,
LOGIT_SCALE = logit_scaling,
num_warps = num_warps,
)
else:
# For large vocabs > 65536 like Gemma 256K
#logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = "cuda:0")
logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = logits.device)
_chunked_cross_entropy_forward[(n_rows, n_chunks,)](
logits, logits.stride(0),
losses,
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
N_CHUNKS = n_chunks,
BLOCK_SIZE = MAX_FUSED_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
DO_LOGIT_SCALING = DO_LOGIT_SCALING,
LOGIT_SCALE = logit_scaling,
num_warps = 8,
)
# logsumexp(chunked_logsumexp) - x
# Do the -x separately
logsumexp = torch.logsumexp(logsumexp, dim = 1) # Row sum
losses += logsumexp
losses.masked_fill_(labels == -100, 0) # Don't forget to mask padding out!
pass
ctx.save_for_backward(logits, logsumexp, labels)
ctx.DO_SOFTCAPPING = DO_SOFTCAPPING
ctx.logit_softcapping = logit_softcapping
ctx.DO_LOGIT_SCALING = DO_LOGIT_SCALING
ctx.logit_scaling = logit_scaling
return losses
pass
@staticmethod
def backward(ctx, dlosses):
logits, logsumexp, labels = ctx.saved_tensors
n_rows, vocab_size = logits.shape
BLOCK_SIZE = 4096
div, mod = divmod(vocab_size, BLOCK_SIZE)
n_blocks = div + (mod != 0)
_cross_entropy_backward[(n_rows, n_blocks,)](
logits, logits.stride(0),
dlosses, dlosses.stride(0),
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
SOFTCAP = ctx.logit_softcapping,
DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
LOGIT_SCALE = ctx.logit_scaling,
num_warps = 8,
)
return logits, None, None, None,
pass
pass
@torch._disable_dynamo
def fast_cross_entropy_loss(
logits,
labels,
logit_softcapping = 0,
logit_scaling = 0,
n_items = None,
):
"""
Arguments:
logits: (batch, seq_len, vocab_size)
labels: (batch, seq_len,)
Returns:
losses: float
"""
batch, seq_len, d = logits.shape
assert(labels.shape == (batch, seq_len))
loss = Fast_CrossEntropyLoss.apply(
logits.view(batch*seq_len, d),
labels.view(-1),
logit_softcapping,
logit_scaling,
)
if n_items is None:
n_items = torch.count_nonzero(labels != -100)
return loss.sum() / n_items
pass
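# A commented sanity check against PyTorch's reference cross entropy (shapes
# are illustrative; a CUDA device is required for the Triton kernels):
#
#   logits = torch.randn(2, 8, 32000, device = "cuda")
#   labels = torch.randint(0, 32000, (2, 8), device = "cuda")
#   ref = torch.nn.functional.cross_entropy(logits.view(-1, 32000).float(), labels.view(-1))
#   out = fast_cross_entropy_loss(logits, labels)
#   # `out` and `ref` should agree up to floating point tolerance.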
from transformers.models.llama.modeling_llama import (
LlamaForCausalLM,
CausalLMOutputWithPast,
Optional,
Union,
Cache,
List,
Tuple,
)
# Transformers 4.47 needs Unpack, KwargsForCausalLM
try:
from transformers.models.llama.modeling_llama import Unpack, KwargsForCausalLM
except:
pass
pass
import inspect, re
function = inspect.getsource(LlamaForCausalLM.forward)
function = function.split("\n")
i = re.match(r"[ ]{1,}", function[0]).span(0)[1]
function = [x[i:] for x in function]
function = "\n".join(function)
function = function[function.find("def forward"):]
replacement = """ loss = None
logit_softcapping = getattr(self.config, "final_logit_softcapping", 0)
logit_scaling = getattr(self.config, "logit_scale", 0)
if labels is not None:
shift_logits = logits
if not hasattr(self, "extra_ignored_labels"):
# Fixes https://github.com/unslothai/unsloth/issues/10
self.extra_ignored_labels = torch.full((self.max_seq_length, 1), -100, device = "cuda:0")
pass
shift_labels = torch.hstack((labels[..., 1:], self.extra_ignored_labels[:labels.shape[0]]))
loss = fast_cross_entropy_loss(
logits = shift_logits,
labels = shift_labels,
logit_softcapping = logit_softcapping,
logit_scaling = logit_scaling,
n_items = kwargs.get("num_items_in_batch", None) or kwargs.get("n_items", None),
)
else:
if logit_scaling != 0:
if logits.requires_grad:
logits = logit_scaling * logits
else:
logits *= logit_scaling
pass
pass
if logit_softcapping != 0:
if logits.requires_grad:
logits = (1.0 / logit_softcapping) * logits
logits = torch.tanh(logits)
logits = logit_softcapping * logits
else:
logits *= (1.0 / logit_softcapping)
torch.tanh(logits, out = logits)
logits *= logit_softcapping
pass
pass
pass
"""
function = \
function[:function.find(" loss = None")] + \
replacement + \
function[ function.find(" if not return_dict"):]
function = function.replace("logits = logits.float()", "\n")
# Indent the function body by 4 spaces so it nests inside the class below
function = function.split("\n")
# Not the first one though!
function = [function[0]] + [" "*4 + x for x in function[1:]]
function = "\n".join(function)
function = f"class Unsloth_LlamaForCausalLM(LlamaForCausalLM):\n"\
f" {function}\n"
exec(function, globals())
del function, replacement, inspect, re
def patch_llama_for_causal_lm():
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaForCausalLM = Unsloth_LlamaForCausalLM
return
pass
def unpatch_llama_for_causal_lm():
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaForCausalLM = LlamaForCausalLM
return
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .utils import (
fast_dequantize,
QUANT_STATE,
get_lora_parameters,
get_lora_parameters_bias,
matmul_lora,
torch_amp_custom_fwd,
torch_amp_custom_bwd,
)
class LoRA_MLP(torch.autograd.Function):
"""
### LoRA weights
G = G + Ag @ Bg
U = U + Au @ Bu
W = W + Aw @ Bw
### SwiGLU(X)
e = X @ G
f = e * sigmoid(e)
g = X @ U
h = f * g
i = h @ W
### Backpropagation chain rule
See our blog post for more details
df = sigmoid(e) * (1 - f) + f
dC/dW = h.T @ dY
dC/dU = X.T @ (D @ W.T * f)
dC/dG = X.T @ (D @ W.T * df * g)
### Down projection LoRA weights
dC/dAw = dC/dW @ B.T
dC/dBw = A.T @ dC/dW
dC/dAw = h.T @ dY @ B.T
dC/dBw = A.T @ h.T @ dY
### Up projection LoRA weights
dC/dAu = X.T @ (D @ W.T * f) @ B.T
dC/dBu = A.T @ X.T @ (D @ W.T * f)
### Gate projection LoRA weights
dC/dAg = X.T @ (D @ W.T * df * g) @ B.T
dC/dBg = A.T @ X.T @ (D @ W.T * df * g)
Don't forget to see our blog post for more details!
"""
@staticmethod
@torch_amp_custom_fwd
def forward(ctx, X : torch.Tensor,
gateW, gateW_quant, gateA, gateB, gateS,
upW, upW_quant, upA, upB, upS,
downW, downW_quant, downA, downB, downS,
_forward_function, _backward_function,
inplace = True,):
dtype = X.dtype
e = matmul_lora(X, gateW, gateW_quant, gateA, gateB, gateS)
g = matmul_lora(X, upW, upW_quant, upA, upB, upS)
h = _forward_function(e, g)
i = matmul_lora(h, downW, downW_quant, downA, downB, downS)
ctx.custom_saved_tensors = (
gateW, gateW_quant, gateS,
upW, upW_quant, upS,
downW, downW_quant, downS,
_backward_function,
)
ctx.save_for_backward(gateA, gateB, upA, upB, downA, downB,
X, e, g)
ctx.inplace = inplace
return i
pass
@staticmethod
@torch_amp_custom_bwd
def backward(ctx, dY : torch.Tensor):
gateW, gateW_quant, gateS, upW, upW_quant, upS, downW, downW_quant, downS, \
_backward_function = ctx.custom_saved_tensors
gateA, gateB, upA, upB, downA, downB, \
X, e, g = ctx.saved_tensors
gateA, gateB, upA, upB, downA, downB = \
gateA.t(), gateB.t(), upA.t(), upB.t(), downA.t(), downB.t()
batch, seq_len, hd = X.shape
dY = dY.view(-1, dY.shape[-1])
X = X .view(-1, X .shape[-1])
e = e .view(-1, e .shape[-1])
g = g .view(-1, g .shape[-1])
dtype = X.dtype
DW = matmul_lora(dY, downW.t(), downW_quant, downB, downA, downS)
DW, e, g = _backward_function(DW, e, g)
h, df, de = DW, e, g
# Down projection LoRA weights
d_downA = h.t() @ (dY @ downB.t())
d_downB = (downA.t() @ h.t()) @ dY
d_downA *= downS
d_downB *= downS
# Up projection LoRA weights
d_upA = X.t() @ (df @ upB.t())
d_upB = (upA.t() @ X.t()) @ df
d_upA *= upS
d_upB *= upS
# Gate projection LoRA weights
d_gateA = X.t() @ (de @ gateB.t())
d_gateB = (gateA.t() @ X.t()) @ de
d_gateA *= gateS
d_gateB *= gateS
# dX = matmul_lora(df, upW.t(), upW_quant, upB, upA, upS)
# dX += matmul_lora(de, gateW.t(), gateW_quant, gateB, gateA, gateS)
upW = fast_dequantize(upW.t(), upW_quant)
dX = torch.matmul(df, upW.t(), out = X if ctx.inplace else None)
del upW
dX += df @ upB.to(dtype).t() @ (upS * upA.to(dtype).t())
gateW = fast_dequantize(gateW.t(), gateW_quant)
dX += de @ gateW.t()
del gateW
dX += de @ gateB.to(dtype).t() @ (gateS * gateA.to(dtype).t())
# gateW, gateW_quant, gateA, gateB, gateS,
# upW, upW_quant, upA, upB, upS,
# downW, downW_quant, downA, downB, downS,
return dX.view(batch, seq_len, hd), \
None, None, d_gateA.t(), d_gateB.t(), None, \
None, None, d_upA.t(), d_upB.t(), None, \
None, None, d_downA.t(), d_downB.t(), None, \
None, None, None, # _backward and _forward and inplace
pass
pass
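# Reference sketch: the unfused SwiGLU MLP the docstring above describes, written in plain
# PyTorch for clarity. The dense gate/up/down weights are assumptions (already oriented for
# right-multiplication); LoRA_MLP fuses this computation together with the LoRA additions.
def _reference_swiglu_mlp(X, gate_weight, up_weight, down_weight):
    e = X @ gate_weight           # gate projection
    f = e * torch.sigmoid(e)      # SiLU / swish activation
    g = X @ up_weight             # up projection
    h = f * g                     # elementwise gating
    return h @ down_weight        # down projection
pass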
from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
def apply_lora_mlp_swiglu(self, X, inplace = True):
gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
out = LoRA_MLP.apply(X,
gateW, gateW_quant, gateA, gateB, gateS,
upW, upW_quant, upA, upB, upS,
downW, downW_quant, downA, downB, downS,
swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel,
inplace,)
return out
pass
from .geglu import geglu_exact_forward_kernel, geglu_exact_backward_kernel
def apply_lora_mlp_geglu_exact(self, X, inplace = True):
gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
out = LoRA_MLP.apply(X,
gateW, gateW_quant, gateA, gateB, gateS,
upW, upW_quant, upA, upB, upS,
downW, downW_quant, downA, downB, downS,
geglu_exact_forward_kernel, geglu_exact_backward_kernel,
inplace,)
return out
pass
from .geglu import geglu_approx_forward_kernel, geglu_approx_backward_kernel
def apply_lora_mlp_geglu_approx(self, X):
gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
out = LoRA_MLP.apply(X,
gateW, gateW_quant, gateA, gateB, gateS,
upW, upW_quant, upA, upB, upS,
downW, downW_quant, downA, downB, downS,
geglu_approx_forward_kernel, geglu_approx_backward_kernel,)
return out
pass
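# Illustrative sketch: one way the apply_lora_mlp_* helpers above could be attached to a
# Hugging Face MLP module, by binding the function as that instance's forward. The module
# is assumed to expose gate_proj / up_proj / down_proj LoRA layers.
import types
def _example_bind_mlp_forward(mlp_module):
    mlp_module.forward = types.MethodType(apply_lora_mlp_swiglu, mlp_module)
    return mlp_module
pass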
class LoRA_QKV(torch.autograd.Function):
"""
### LoRA weights
Wq = Wq + Aq @ Bq
Wk = Wk + Ak @ Bk
Wv = Wv + Av @ Bv
Q = X @ Wq = X @ Wq + X @ Aq @ Bq
K = X @ Wk = X @ Wk + X @ Ak @ Bk
V = X @ Wv = X @ Wv + X @ Av @ Bv
### Backpropagation chain rule
See our blogpost for more details.
dC/dWq = X.T @ D(Wq)
dC/dWk = X.T @ D(Wk)
dC/dWv = X.T @ D(Wv)
We then sum them all to find dC/dX
### Q projection LoRA weights
dC/dAq = X.T @ D(Wq) @ B.T
dC/dBq = A.T @ X.T @ D(Wq)
### K projection LoRA weights
dC/dAk = X.T @ D(Wk) @ B.T
dC/dBk = A.T @ X.T @ D(Wk)
### V projection LoRA weights
dC/dAv = X.T @ D(Wv) @ B.T
dC/dBv = A.T @ X.T @ D(Wv)
"""
@staticmethod
@torch_amp_custom_fwd
def forward(ctx, X : torch.Tensor,
QW, QW_quant, QA, QB, QS,
KW, KW_quant, KA, KB, KS,
VW, VW_quant, VA, VB, VS,
inplace = True):
dtype = X.dtype
Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)
K = matmul_lora(X, KW, KW_quant, KA, KB, KS)
V = matmul_lora(X, VW, VW_quant, VA, VB, VS)
ctx.custom_saved_tensors = (
QW, QW_quant, QS,
KW, KW_quant, KS,
VW, VW_quant, VS,
)
ctx.save_for_backward(X, QA, QB, KA, KB, VA, VB,)
ctx.inplace = inplace
return Q, K, V
pass
@staticmethod
@torch_amp_custom_bwd
def backward(ctx, dQ, dK, dV):
QW, QW_quant, QS, KW, KW_quant, KS, VW, VW_quant, VS = \
ctx.custom_saved_tensors
X, QA, QB, KA, KB, VA, VB, = ctx.saved_tensors
QA, QB, KA, KB, VA, VB = \
QA.t(), QB.t(), KA.t(), KB.t(), VA.t(), VB.t()
batch, seq_len, hd = X.shape
dQ = dQ.view(-1, dQ.shape[-1])
dK = dK.reshape(-1, dK.shape[-1]) # view doesn't work on K.T
dV = dV.view(-1, dV.shape[-1])
X = X .view(-1, X .shape[-1])
dtype = X.dtype
### Weight projection LoRA weights
# See our blogpost for more details.
# Q Projection
d_QA = X.t() @ (dQ @ QB.t())
d_QB = (QA.t() @ X.t()) @ dQ
d_QA *= QS
d_QB *= QS
# K Projection
d_KA = X.t() @ (dK @ KB.t())
d_KB = (KA.t() @ X.t()) @ dK
d_KA *= KS
d_KB *= KS
# V Projection
d_VA = X.t() @ (dV @ VB.t())
d_VB = (VA.t() @ X.t()) @ dV
d_VA *= VS
d_VB *= VS
# Combine derivatives to find dX
# dQ
QW = fast_dequantize(QW.t(), QW_quant)
dX = torch.matmul(dQ, QW.t(), out = X if ctx.inplace else None)
del QW
dX += (dQ @ QB.to(dtype).t() @ (QS * QA.to(dtype).t()))
# dK
KW = fast_dequantize(KW.t(), KW_quant)
dX += dK @ KW.t()
del KW
dX += dK @ KB.to(dtype).t() @ (KS * KA.to(dtype).t())
# dV
VW = fast_dequantize(VW.t(), VW_quant)
dX += dV @ VW.t()
del VW
dX += dV @ VB.to(dtype).t() @ (VS * VA.to(dtype).t())
# QW, QW_quant, QA, QB, QS,
# KW, KW_quant, KA, KB, KS,
# VW, VW_quant, VA, VB, VS,
return dX.view(batch, seq_len, hd), \
None, None, d_QA.t(), d_QB.t(), None, \
None, None, d_KA.t(), d_KB.t(), None, \
None, None, d_VA.t(), d_VB.t(), None, \
None,
pass
pass
def apply_lora_qkv(self, X, inplace = True):
QW, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj)
KW, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj)
VW, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj)
Q, K, V = LoRA_QKV.apply(X,
QW, QW_quant, QA, QB, QS,
KW, KW_quant, KA, KB, KS,
VW, VW_quant, VA, VB, VS,
inplace,
)
return Q, K, V
pass
class LoRA_W(torch.autograd.Function):
"""
### LoRA weights
W = W + A @ B
Y = X @ W = X @ W + X @ A @ B
### Backpropagation chain rule
dC/dW = X.T @ dY
dC/dX = dY @ W.T
### LoRA weight gradients
dC/dA = X.T @ dY @ B.T
dC/dB = A.T @ X.T @ dY
"""
@staticmethod
@torch_amp_custom_fwd
def forward(ctx, X : torch.Tensor,
W, W_quant, A, B, S):
dtype = X.dtype
XW = matmul_lora(X, W, W_quant, A, B, S)
ctx.custom_saved_tensors = (W, W_quant, S,)
ctx.save_for_backward(A, B, X)
return XW
pass
@staticmethod
@torch_amp_custom_bwd
def backward(ctx, dY : torch.Tensor):
W, W_quant, S = ctx.custom_saved_tensors
A, B, X = ctx.saved_tensors
A, B = A.t(), B.t()
batch, seq_len, hd = X.shape
dY = dY.reshape(-1, dY.shape[-1]) # Must be reshape
X = X .reshape(-1, X .shape[-1]) # Must be reshape
dtype = X.dtype
### Weight projection LoRA weights
# Weight projection
d_A = X.t() @ (dY @ B.t())
d_B = (A.t() @ X.t()) @ dY
d_A *= S
d_B *= S
# Get derivative for dX
W = fast_dequantize(W.t(), W_quant)
dX = dY @ W.t()
del W
dX += dY @ B.to(dtype).t() @ (S * A.to(dtype).t())
# W, W_quant, A, B, S
return dX.view(batch, seq_len, hd), \
None, None, d_A.t(), d_B.t(), None
pass
pass
def apply_lora_o(self, X):
OW, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj)
O = LoRA_W.apply(X, OW, OW_quant, OA, OB, OS)
return O
pass
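# Illustrative sketch: calling the attention-side helpers directly on a module that exposes
# q_proj / k_proj / v_proj / o_proj LoRA layers (e.g. a HF Llama attention block). The shapes
# and the commented-out attention step are assumptions for illustration only.
def _example_apply_lora_attention(attn_module, X):
    Q, K, V = apply_lora_qkv(attn_module, X, inplace = False)
    # ... RoPE + attention over Q, K, V would run here, producing a context tensor of
    # shape (batch, seq_len, hidden_size) ...
    # O = apply_lora_o(attn_module, context)
    return Q, K, V
pass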
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from functools import lru_cache
from transformers.models.llama.modeling_llama import logger
torch_compile_options = {
"epilogue_fusion" : True,
"max_autotune" : True,
"shape_padding" : True,
"trace.enabled" : False, # Output Triton kernel outputs!
"triton.cudagraphs" : False,
}
# Flex Attention supported from torch 2.5 onwards only
try:
from torch.nn.attention.flex_attention import (
flex_attention as _flex_attention,
create_block_mask as _create_block_mask,
)
_flex_attention = torch.compile(_flex_attention, dynamic = True, options = torch_compile_options)
HAS_FLEX_ATTENTION = False # Kept disabled for now, even when the import and compile succeed
except:
HAS_FLEX_ATTENTION = False
pass
if not HAS_FLEX_ATTENTION:
# Logit softcapping
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
n_kv_heads = self.num_key_value_heads
n_groups = self.num_key_value_groups
# Grouped query attention
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hidden_size / num_attention_heads). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
A = torch.matmul(Q, K.transpose(2, 3))
A = t * torch.tanh(A / t) # Logit softcapping
A += causal_mask[:q_len, :q_len]
# Much slower in torch compile!
# A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
A = torch.matmul(A, V)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
pass
create_flex_attention_causal_mask = None
create_flex_attention_sliding_window_mask = None
else:
# See https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
# for more examples
# BSD 3-Clause License Copyright (c) 2023, Driss Guessous, Horace He et al
import functools, math
def generate_tanh_softcap(t):
def tanh_softcap(x, b, h, q_idx, kv_idx):
return t * torch.tanh(x / t)
return tanh_softcap
pass
def causal_masker(b, h, q_idx, kv_idx):
return q_idx >= kv_idx
pass
@functools.lru_cache
def sliding_window_masker(size = 4096):
def sliding_window(b, h, q_idx, kv_idx):
causal_mask = q_idx >= kv_idx
window_mask = q_idx - kv_idx <= size
return causal_mask & window_mask
return sliding_window
pass
@functools.lru_cache
def create_block_mask(mask, n = 128):
return _create_block_mask(
mask, 1, 1, n, n,
BLOCK_SIZE = 128,
_compile = True,
)
pass
def create_flex_attention_causal_mask(max_seq_length = 8192):
causal_mask = create_block_mask(causal_masker, max_seq_length)
return causal_mask
pass
def create_flex_attention_sliding_window_mask(max_seq_length = 8192, sliding_window = 4096):
sliding_masker = sliding_window_masker(sliding_window)
causal_mask = create_block_mask(sliding_masker, max_seq_length)
return causal_mask
pass
@functools.lru_cache
def flex_attention(s, t):
scale = 1.0 / math.sqrt(s)
score_mod = generate_tanh_softcap(t)
return functools.partial(
_flex_attention, score_mod = score_mod, scale = scale, enable_gqa = True,
)
pass
def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
fx = flex_attention(s, t)
A = fx(query = Q, key = K, value = V, block_mask = causal_mask)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
pass
pass
torch_matmul = torch.matmul
torch_tanh = torch.tanh
torch_nn_functional_softmax = torch.nn.functional.softmax
def slow_inference_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
n_kv_heads = self.num_key_value_heads
n_groups = self.num_key_value_groups
# Grouped query attention
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hidden_size / num_attention_heads). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
A = torch_matmul(Q, K.transpose(2, 3))
# Logit softcapping
A /= t; torch_tanh(A, out = A); A *= t;
A += causal_mask[:q_len, :q_len]
# Much slower in torch compile!
# A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
A = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
A = torch_matmul(A, V)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
pass
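# Worked example: logit softcapping as used above squashes raw attention scores smoothly into
# (-t, t). For t = 50, a score of 30 maps to 50 * tanh(30 / 50) ≈ 26.9, while arbitrarily large
# scores saturate just below ±50 instead of growing without bound. Values here are illustrative.
def _example_logit_softcapping(scores = None, t = 50.0):
    if scores is None:
        scores = torch.tensor([-1000.0, -30.0, 0.0, 30.0, 1000.0])
    return t * torch.tanh(scores / t)
pass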
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings, triton_tanh
@triton.jit
def _exact_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
block_idx = tl.program_id(0)
offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
# f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
# h = f * up
e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
f_row = 0.5 * e_row * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
f_row = f_row.to(g_row.dtype) # Exact copy from HF
h_row = f_row * g_row
# Store h
tl.store(h + offsets, h_row, mask = mask)
pass
def geglu_exact_forward_kernel(gate, up):
batch, seq_len, hd = gate.shape
n_elements = gate.numel()
out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
return out
pass
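# Sanity-check sketch: the fused kernel above should match PyTorch's exact (erf-based) GELU
# gating up to dtype rounding. Shapes are assumptions and a CUDA device is required.
def _check_geglu_exact(batch = 2, seq_len = 16, hd = 64, dtype = torch.float16):
    gate = torch.randn(batch, seq_len, hd, device = "cuda", dtype = dtype)
    up   = torch.randn(batch, seq_len, hd, device = "cuda", dtype = dtype)
    fused     = geglu_exact_forward_kernel(gate, up)
    reference = torch.nn.functional.gelu(gate.float(), approximate = "none").to(dtype) * up
    return torch.allclose(fused, reference, atol = 1e-2, rtol = 1e-2)
pass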
@triton.jit
def _exact_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
"""
f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
h = f * up
df/de (with help of Wolfram :)
df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
Reuse via
f = 1/2 * (1 + erf(1/sqrt(2) * e)) * e
"""
block_idx = tl.program_id(0)
offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
# Break e_row away for re-use
# f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
f_partial_row = 0.5 * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
f_row = f_partial_row * e_row
f_row = f_row.to(DW_row.dtype)
# h = f * g
h_row = f_row * g_row
# df = DW * f
df_row = DW_row * f_row
# dg = DW * g
dg_row = DW_row * g_row
# df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
t = 0.3989422804014327 # 1/sqrt(2*pi)
df_de = f_partial_row + t * e_row * tl.exp(-0.5 * e_row * e_row)
de_row = dg_row.to(tl.float32) * df_de
de_row = de_row.to(DW_row.dtype)
# Store derivatives in buffers
tl.store(DW + offsets, h_row, mask = mask) # h = f * g
tl.store(e + offsets, df_row, mask = mask) # df = DW * f
tl.store(g + offsets, de_row, mask = mask) # de
pass
def geglu_exact_backward_kernel(DW, e, g):
batch_seq_len, hd = e.shape
n_elements = e.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_exact_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
return DW, e, g
pass
@triton.jit
def _approx_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
block_idx = tl.program_id(0)
offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
# f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
# f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
# h = f * up
s = 0.7978845608028654 # math.sqrt(2 / math.pi)
e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
f_row = 0.5 * e_row * (
triton_tanh(s * e_row * (1.0 + 0.044715 * e_row * e_row)) \
+ 1.0
)
f_row = f_row.to(g_row.dtype) # Exact copy from HF
h_row = f_row * g_row
# Store h
tl.store(h + offsets, h_row, mask = mask)
pass
def geglu_approx_forward_kernel(gate, up):
batch, seq_len, hd = gate.shape
n_elements = gate.numel()
out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
return out
pass
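# Sanity-check sketch: same idea as the exact check above, but against PyTorch's
# tanh-approximate GELU, which is what this kernel implements. Shapes are assumptions.
def _check_geglu_approx(batch = 2, seq_len = 16, hd = 64, dtype = torch.float16):
    gate = torch.randn(batch, seq_len, hd, device = "cuda", dtype = dtype)
    up   = torch.randn(batch, seq_len, hd, device = "cuda", dtype = dtype)
    fused     = geglu_approx_forward_kernel(gate, up)
    reference = torch.nn.functional.gelu(gate.float(), approximate = "tanh").to(dtype) * up
    return torch.allclose(fused, reference, atol = 1e-2, rtol = 1e-2)
pass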
@triton.jit
def _approx_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
"""
f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
h = f * up
df/de (with help from https://arxiv.org/pdf/2305.12073.pdf :))
df/de = 1/2 * [1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )] +
1/2 * sech^2 [ sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ] * \
( sqrt(2/pi) * x * (1 + 0.044715 * x^2 * 3 ) )
Notice sech^2(x) = 1 - tanh^2(x)
So reuse tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )
See https://www.desmos.com/calculator/nqprfoni6x
"""
block_idx = tl.program_id(0)
offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
# See https://www.desmos.com/calculator/nqprfoni6x
s = 0.7978845608028654 # math.sqrt(2 / math.pi)
a = s * e_row # a = sqrt(2 / pi) * x
b = a * 0.044715 * e_row * e_row # b = a * 0.044715 * x^2
T = 1.0 + triton_tanh(a + b)
T2 = 0.5 * T
# Q = 0.5 * -T * (T - 2.0) * (a + 3.0 * b)
Q2 = -T2 * (T - 2.0) * (a + 3.0 * b)
df_de = T2 + Q2 # 1/2 * (T + Q)
# f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
f_row = T2 * e_row
f_row = f_row.to(DW_row.dtype)
# h = f * g
h_row = f_row * g_row
# df = DW * f
df_row = DW_row * f_row
# dg = DW * g
dg_row = DW_row * g_row
de_row = dg_row.to(tl.float32) * df_de
de_row = de_row.to(DW_row.dtype)
# Store derivatives in buffers
tl.store(DW + offsets, h_row, mask = mask) # h = f * g
tl.store(e + offsets, df_row, mask = mask) # df = DW * f
tl.store(g + offsets, de_row, mask = mask) # de
pass
def geglu_approx_backward_kernel(DW, e, g):
batch_seq_len, hd = e.shape
n_elements = e.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_approx_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
return DW, e, g
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
# Copyright 2024-present Andrej Karpathy & the llm.c team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings
@triton.jit
def layernorm_forward(
Y, Y_row_stride,
X, X_row_stride,
W,
b,
r,
mu,
n_cols, eps,
BLOCK_SIZE : tl.constexpr
):
row_idx = tl.program_id(0)
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
Y += row_idx * Y_row_stride
X += row_idx * X_row_stride
r += row_idx
mu += row_idx
# According to https://pytorch.org/torchtune/stable/_modules/torchtune/modules/layer_norm.html#Fp32LayerNorm, all modules
# are in float32!
X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
b_row = tl.load(b + col_offsets, mask = mask, other = 0).to(tl.float32)
mean_X = tl.sum(X_row, axis = 0) / n_cols
XX = X_row - mean_X
row_var = tl.sum(XX * XX, axis = 0) / n_cols
inv_var = tl.math.rsqrt(row_var + eps)
tl.store (r, inv_var)
tl.store (mu, mean_X)
output = (XX * inv_var) * W_row + b_row
tl.store(Y + col_offsets, output, mask = mask)
pass
@triton.jit
def layernorm_backward(
dY, dY_row_stride,
X, X_row_stride,
W,
b,
r,
mu,
n_cols, eps,
BLOCK_SIZE : tl.constexpr
):
# Approximately follows https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
row_idx = tl.program_id(0)
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
dY += row_idx * dY_row_stride
X += row_idx * X_row_stride
r += row_idx
mu += row_idx
# According to https://pytorch.org/torchtune/stable/_modules/torchtune/modules/layer_norm.html#Fp32LayerNorm, all modules
# are in float32!
dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32)
X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
b_row = tl.load(b + col_offsets, mask = mask, other = 0).to(tl.float32)
inv_var = tl.load(r) .to(tl.float32)
mean = tl.load(mu).to(tl.float32)
normed = (X_row - mean) * inv_var
dY_W = dY_row * W_row
dX_row = dY_W - tl.sum(dY_W, axis = 0) / n_cols - normed * tl.sum(dY_W * normed, axis = 0) / n_cols
dX_row = dX_row * inv_var
tl.store(dY + col_offsets, dX_row, mask = mask)
pass
class Fast_Layernorm(torch.autograd.Function):
@staticmethod
def forward(ctx, X, W, b, eps):
shape = X.shape
dim = shape[-1]
X = X.view(-1, dim)
n_rows, n_cols = X.shape
BLOCK_SIZE, num_warps = calculate_settings(n_cols)
Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = "cuda:0")
r = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
mu = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
layernorm_forward[(n_rows,)](
Y, Y.stride(0),
X, X.stride(0),
W,
b,
r,
mu,
n_cols, eps,
BLOCK_SIZE = BLOCK_SIZE,
num_warps = num_warps,
)
ctx.eps = eps
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.save_for_backward(X, W, b, r, mu)
return Y.view(*shape)
pass
@staticmethod
def backward(ctx, dY):
shape = dY.shape
dim = shape[-1]
dY = dY.view(-1, dim)
X, W, b, r, mu = ctx.saved_tensors
n_rows, n_cols = dY.shape
layernorm_backward[(n_rows,)](
dY, dY.stride(0),
X, X .stride(0),
W,
b,
r,
mu,
n_cols, ctx.eps,
BLOCK_SIZE = ctx.BLOCK_SIZE,
num_warps = ctx.num_warps,
)
dX = dY.view(*shape)
return dX, None, None, None, None
pass
pass
def fast_layernorm(layernorm, X):
assert(layernorm.elementwise_affine is True)
W = layernorm.weight
bias = layernorm.bias
eps = layernorm.variance_epsilon if \
hasattr(layernorm, "variance_epsilon") \
else layernorm.eps
out = Fast_Layernorm.apply(X, W, bias, eps)
return out
pass
from torch.nn import LayerNorm
class Unsloth_LayerNorm(LayerNorm):
def forward(self, X):
return fast_layernorm(self, X)
pass
pass
def patch_layernorm():
import torch.nn
torch.nn.LayerNorm = Unsloth_LayerNorm
return
pass
def unpatch_layernorm():
import torch.nn
torch.nn.LayerNorm = LayerNorm
return
pass
def test_layernorm(
dim = 1024, eps = 1e-5, dtype = torch.float16,
bsz = 21, random_state = 3407, seqlen = 3341,
):
from torch.nn import LayerNorm
layernorm = LayerNorm((dim,), eps = eps, device = "cuda", dtype = dtype)
torch.cuda.manual_seed(random_state)
torch.manual_seed(random_state)
torch.nn.init.uniform_(layernorm.weight)
torch.nn.init.uniform_(layernorm.bias)
X = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda")
XX = X.clone()
X .requires_grad_(True)
XX.requires_grad_(True)
Y = layernorm(X)
YY = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda", requires_grad = True)
Y.backward(YY)
correct_grad = X.grad.clone()
# from unsloth.kernels import fast_layernorm
Y = fast_layernorm(layernorm, XX)
Y.backward(YY)
assert(torch.dist(correct_grad, XX.grad).item() <= 0.1)
pass
def testing_suite_layernorm():
for dim in [512, 1024, 2048]:
for dtype in [torch.float16, torch.bfloat16]:
with torch.autocast(device_type = "cuda", dtype = dtype):
for seqlen in [3341, 2048, 349]:
for random_state in [3407, 42]:
test_layernorm(
dim = dim,
eps = 1e-5,
dtype = dtype,
bsz = 21,
random_state = random_state,
seqlen = seqlen,
)
pass
pass
pass
pass
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings
@triton.jit
def _rms_layernorm_forward(
Y, Y_row_stride,
X, X_row_stride,
W, W_row_stride,
r, r_row_stride,
n_cols, eps,
BLOCK_SIZE : tl.constexpr
):
"""
Fast RMS Layernorm kernel
Inspiration from a Triton tutorial:
https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
"""
row_idx = tl.program_id(0)
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
Y += row_idx * Y_row_stride
X += row_idx * X_row_stride
r += row_idx * r_row_stride
X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0)#.to(tl.float32)
row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
inv_var = tl.math.rsqrt(row_var + eps)
tl.store(r, inv_var)
normed = X_row * inv_var
normed = normed.to(W_row.dtype) # Exact copy from HF
output = normed * W_row
tl.store(Y + col_offsets, output, mask = mask)
pass
@triton.heuristics({"GEMMA": lambda args: args["GEMMA"],})
@triton.jit
def _rms_layernorm_backward(
dY, dY_row_stride,
X, X_row_stride,
W, W_row_stride,
r, r_row_stride,
dW, dW_row_stride,
n_cols, eps,
GEMMA : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
):
"""
Fast RMS Layernorm kernel for the backward pass
Inspiration from a Triton tutorial:
https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
"""
row_idx = tl.program_id(0)
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
dY += row_idx * dY_row_stride
X += row_idx * X_row_stride
r += row_idx * r_row_stride
dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32)
X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
# Get saved row variance
inv_var = tl.load(r).to(tl.float32)
normed = X_row * inv_var
if GEMMA: dY_W = dY_row * (W_row + 1.0)
else: dY_W = dY_row * W_row
rowsum_dY_normed = tl.sum(dY_W * normed, axis = 0)
output = inv_var/n_cols * (n_cols*dY_W - normed*rowsum_dY_normed)
tl.store(dY + col_offsets, output, mask = mask)
pass
@triton.jit
def _gemma_rms_layernorm_forward(
Y, Y_row_stride,
X, X_row_stride,
W, W_row_stride,
r, r_row_stride,
n_cols, eps,
BLOCK_SIZE : tl.constexpr,
):
# Copies https://github.com/google-deepmind/gemma/blob/main/gemma/layers.py#L31
# and https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L33
# exactly. Essentially all in float32!
row_idx = tl.program_id(0)
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
Y += row_idx * Y_row_stride
X += row_idx * X_row_stride
r += row_idx * r_row_stride
X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
inv_var = tl.math.rsqrt(row_var + eps)
tl.store(r, inv_var)
normed = X_row * inv_var
output = normed * (W_row + 1.0)
tl.store(Y + col_offsets, output, mask = mask)
pass
class Fast_RMS_Layernorm(torch.autograd.Function):
@staticmethod
def forward(ctx, X, W, eps, gemma = False):
shape = X.shape
dim = shape[-1]
X = X.view(-1, dim)
n_rows, n_cols = X.shape
BLOCK_SIZE, num_warps = calculate_settings(n_cols)
#Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = "cuda:0")
#r = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = X.device)
r = torch.empty(n_rows, dtype = torch.float32, device = X.device)
fx = _gemma_rms_layernorm_forward if gemma else _rms_layernorm_forward
fx[(n_rows,)](
Y, Y.stride(0),
X, X.stride(0),
W, W.stride(0),
r, r.stride(0),
n_cols, eps,
BLOCK_SIZE = BLOCK_SIZE,
num_warps = num_warps,
)
ctx.eps = eps
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.GEMMA = gemma
ctx.save_for_backward(X, W, r)
return Y.view(*shape)
pass
@staticmethod
def backward(ctx, dY):
shape = dY.shape
dim = shape[-1]
dY = dY.view(-1, dim)
X, W, r = ctx.saved_tensors
n_rows, n_cols = dY.shape
dW = X
_rms_layernorm_backward[(n_rows,)](
dY, dY.stride(0),
X, X .stride(0),
W, W .stride(0),
r, r .stride(0),
dW, dW.stride(0),
n_cols, ctx.eps,
GEMMA = ctx.GEMMA,
BLOCK_SIZE = ctx.BLOCK_SIZE,
num_warps = ctx.num_warps,
)
dX = dY.view(*shape)
return dX, None, None, None
pass
pass
def fast_rms_layernorm(layernorm, X, gemma = False):
W = layernorm.weight
eps = layernorm.variance_epsilon if \
hasattr(layernorm, "variance_epsilon") \
else layernorm.eps
out = Fast_RMS_Layernorm.apply(X, W, eps, gemma)
return out
pass
from transformers.models.llama.modeling_llama import LlamaRMSNorm
class Unsloth_LlamaRMSNorm(LlamaRMSNorm):
def forward(self, X):
return fast_rms_layernorm(self, X, gemma = False)
pass
pass
try:
from transformers.models.mllama.modeling_mllama import MllamaTextRMSNorm
class Unsloth_MllamaTextRMSNorm(MllamaTextRMSNorm):
def forward(self, X):
return fast_rms_layernorm(self, X, gemma = False)
pass
pass
except:
pass
pass
def patch_rms_layernorm():
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaRMSNorm = Unsloth_LlamaRMSNorm
try:
import transformers.models.mllama.modeling_mllama
transformers.models.mllama.modeling_mllama.MllamaTextRMSNorm = Unsloth_MllamaTextRMSNorm
except:
pass
return
pass
def unpatch_rms_layernorm():
import transformers.models.llama.modeling_llama
transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
try:
import transformers.models.mllama.modeling_mllama
transformers.models.mllama.modeling_mllama.MllamaTextRMSNorm = MllamaTextRMSNorm
except:
pass
return
pass
def test_rms_layernorm(
dim = 1024, eps = 1e-5, dtype = torch.float16,
bsz = 21, random_state = 3407, seqlen = 3341,
):
from transformers.models.llama.modeling_llama import LlamaRMSNorm
layernorm = LlamaRMSNorm((dim,), eps = eps).to("cuda")
torch.cuda.manual_seed(random_state)
torch.manual_seed(random_state)
torch.nn.init.uniform_(layernorm.weight)
X = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda")
XX = X.clone()
X .requires_grad_(True)
XX.requires_grad_(True)
Y = layernorm(X)
YY = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda", requires_grad = True)
Y.backward(YY)
correct_grad = X.grad.clone()
# from unsloth.kernels import fast_rms_layernorm
Y = fast_rms_layernorm(layernorm, XX)
Y.backward(YY)
assert(torch.amax(correct_grad - XX.grad).item() <= 0.05)
pass
def testing_suite_layernorm():
for dim in [512, 1024, 2048]:
for dtype in [torch.float16, torch.bfloat16]:
with torch.autocast(device_type = "cuda", dtype = dtype):
for seqlen in [3341, 2048, 349]:
for random_state in [3407, 42]:
test_rms_layernorm(
dim = dim,
eps = 1e-5,
dtype = dtype,
bsz = 21,
random_state = random_state,
seqlen = seqlen,
)
pass
pass
pass
pass
pass
pass
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton
import triton.language as tl
import torch
from .utils import calculate_settings
ROPE_GROUP_SIZE = 4
@triton.heuristics({"BACKWARD_PASS": lambda args: args["BACKWARD_PASS"],})
@triton.jit
def _rope_embedding(
Q, Q_row_stride,
cos, cos_row_stride,
sin, sin_row_stride,
seqlen,
head_dim : tl.constexpr,
n_heads : tl.constexpr,
BACKWARD_PASS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
):
"""
Calculates the RoPE Embedding quickly
RoPE is Q * cos + rotate_half(Q) * sin
See our blog post for more info
"""
ROPE_GROUP_SIZE = 4
row_position = tl.program_id(0)
group_head_position = tl.program_id(1)
col_offsets = tl.arange(0, BLOCK_SIZE)
half_head_dim = head_dim // 2
mask = col_offsets < half_head_dim
sin1 = tl.load(sin + (row_position % seqlen)*sin_row_stride + \
half_head_dim*0 + col_offsets, mask = mask, other = 0)
cos1 = tl.load(cos + (row_position % seqlen)*cos_row_stride + \
half_head_dim*0 + col_offsets, mask = mask, other = 0)
if BACKWARD_PASS:
# See our blog post for more info.
sin1 = -sin1
pass
# [TODO] Autotune ROPE_GROUP_SIZE to be 1, 2, 4, 8
head_start = group_head_position * ROPE_GROUP_SIZE
head_end = min((head_start + ROPE_GROUP_SIZE), n_heads)
# 10% Faster kernel from [HuyNguyen-hust](https://github.com/unslothai/unsloth/pull/238)
for k in range(head_start, head_end):
offs_q1 = row_position * Q_row_stride + k * head_dim + col_offsets
offs_q2 = row_position * Q_row_stride + k * head_dim + col_offsets + half_head_dim
# For Gemma - sometimes RoPE must be done in float32 and not bfloat16
Q1 = tl.load(Q + offs_q1, mask = mask, other = 0).to(sin1.dtype)
Q2 = tl.load(Q + offs_q2, mask = mask, other = 0).to(sin1.dtype)
tl.store(Q + offs_q1, Q1*cos1 - Q2*sin1, mask = mask)
tl.store(Q + offs_q2, Q2*cos1 + Q1*sin1, mask = mask)
pass
pass
class Fast_RoPE_Embedding(torch.autograd.Function):
@staticmethod
def forward(ctx, Q, cos, sin):
cos, sin = cos.squeeze(), sin.squeeze()
batch, seq_len, n_heads, head_dim = Q.shape
Q = Q.view(batch*seq_len, n_heads*head_dim)
n_rows, n_cols = Q.shape
assert(seq_len <= cos.shape[0])
# [TODO] Changing blocksize to head_dim//2 seems to have
# some concurrency / non-deterministic issues.
BLOCK_SIZE, num_warps = calculate_settings(head_dim//2) # (head_dim//2)
# group_size = 4 # 4 or 8, too large group_size can hurt performance.
div, mod = divmod(n_heads, ROPE_GROUP_SIZE)
n_groups = div + (mod != 0)
_rope_embedding[(n_rows, n_groups, )](
Q, Q.stride(0),
cos, cos.stride(0),
sin, sin.stride(0),
seq_len,
head_dim, n_heads,
BACKWARD_PASS = False,
BLOCK_SIZE = BLOCK_SIZE,
num_warps = num_warps,
)
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.n_groups = n_groups
ctx.cos = cos
ctx.sin = sin
return Q.view(batch, seq_len, n_heads, head_dim)
pass
@staticmethod
def backward(ctx, dY):
batch, seq_len, n_heads, head_dim = dY.shape
dY = dY.reshape(batch*seq_len, n_heads*head_dim)
# Must be reshape not view
n_rows, n_cols = dY.shape
cos = ctx.cos
sin = ctx.sin
_rope_embedding[(n_rows, ctx.n_groups, )](
dY, dY .stride(0),
cos, cos.stride(0),
sin, sin.stride(0),
seq_len, head_dim, n_heads,
BACKWARD_PASS = True,
BLOCK_SIZE = ctx.BLOCK_SIZE,
num_warps = ctx.num_warps,
)
dY = dY.view(batch, seq_len, n_heads, head_dim)
return dY, None, None,
pass
pass
def fast_rope_embedding(Q, K, cos, sin):
Q = Fast_RoPE_Embedding.apply(Q.transpose(1, 2), cos, sin).transpose(1, 2)
K = Fast_RoPE_Embedding.apply(K.transpose(1, 2), cos, sin).transpose(1, 2)
return Q, K
pass
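# Illustrative sketch: applying fast_rope_embedding to dummy Q/K tensors. The layout mirrors
# what the attention code produces — a contiguous (batch, seq_len, n_heads, head_dim) tensor
# transposed on dims 1 and 2 — and the cos/sin construction follows the standard RoPE recipe;
# all shapes and the 10000 base are assumptions for illustration.
def _example_fast_rope_embedding(batch = 2, seq_len = 128, n_heads = 8, head_dim = 64):
    dtype = torch.float16
    Q = torch.randn(batch, seq_len, n_heads, head_dim, device = "cuda", dtype = dtype).transpose(1, 2)
    K = torch.randn(batch, seq_len, n_heads, head_dim, device = "cuda", dtype = dtype).transpose(1, 2)
    # cos/sin of shape (seq_len, head_dim), with duplicated halves as in rotary embeddings
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2, device = "cuda").float() / head_dim))
    freqs = torch.outer(torch.arange(seq_len, device = "cuda").float(), inv_freq)
    emb = torch.cat((freqs, freqs), dim = -1)
    return fast_rope_embedding(Q, K, emb.cos(), emb.sin())
pass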
class Slow_RoPE_Embedding(torch.autograd.Function):
@staticmethod
def forward(ctx, Q, cos, sin, position_ids):
if position_ids is not None:
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
# Q * cos + rotate_half(Q) * sin
half = Q.shape[-1]//2
RH_Q = torch.cat((-Q[..., half:], Q[..., :half]), dim = -1)
Q *= cos
Q.addcmul_(RH_Q, sin)
# RH_Q *= sin
# Q += RH_Q
ctx.save_for_backward(cos, sin)
return Q
pass
@staticmethod
def backward(ctx, dY):
cos, sin = ctx.saved_tensors
# Q * cos + rotate_half.T(Q) * sin
half = dY.shape[-1]//2
RH_dY = torch.cat((dY[..., half:], -dY[..., :half]), dim = -1)
dY *= cos
dY.addcmul_(RH_dY, sin)
# RH_dY *= sin
# dY += RH_dY
return dY, None, None, None
pass
pass
def inplace_rope_embedding(Q, K, cos, sin, position_ids):
Q = Slow_RoPE_Embedding.apply(Q, cos, sin, position_ids)
K = Slow_RoPE_Embedding.apply(K, cos, sin, position_ids)
return Q, K
pass