<a href="https://ko-fi.com/unsloth"><img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/buy me a coffee button.png" height="48"></a>
### Finetune Llama 3.1, Mistral, Phi-3 & Gemma 2-5x faster with 80% less memory!

## ✨ Finetune for Free
All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, Ollama, vLLM or uploaded to Hugging Face.
- Run [Llama 3 conversational notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing) and [Mistral v0.3 ChatML](https://colab.research.google.com/drive/15F1xyn8497_dUbxZP4zWmPZ3PJx1Oymv?usp=sharing)
- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text
- This [continued pretraining notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) is for learning another language
- Click [here](https://github.com/unslothai/unsloth/wiki) for Unsloth's detailed documentation.
## 🦥 Unsloth.ai News
- 📣 NEW! [Llama 3.1 8b, 70b](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing) both Base and Instruct now supported
- 📣 NEW! [Mistral Nemo-12b](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing) both Base and Instruct now supported
- 📣 NEW! [Gemma-2-9b](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing) and Gemma-2-27b now supported
- 📣 NEW! Continued Pretraining [notebook](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing) for other languages like Korean!
- 📣 NEW! Qwen2 now works
- 📣 [Mistral v0.3 Base](https://colab.research.google.com/drive/1_yNCks4BTD5zOnjozppphh5GzMFaMKq_?usp=sharing) and Mistral v0.3 Instruct now supported
- 📣 [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here + [2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models
- 📣 We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support [4x longer context windows](https://unsloth.ai/blog/long-context)!
| Type | Links |
| --- | --- |
| <img height="14" src="https://upload.wikimedia.org/wikipedia/commons/6/6f/Logo_of_Twitter.svg" /> **Twitter (aka X)** | [Follow us on X](https://twitter.com/unslothai) |
## ⭐ Key Features
- All kernels written in [OpenAI's Triton](https://openai.com/research/triton) language. **Manual backprop engine**.
- **0% loss in accuracy** - no approximation methods - all exact.
- No change of hardware needed. Supports NVIDIA GPUs from 2018 onwards. Minimum CUDA Capability 7.0 (V100, T4, Titan V, RTX 20, 30, 40x, A100, H100, L40 etc). [Check your GPU!](https://developer.nvidia.com/cuda-gpus) GTX 1070 and 1080 work, but are slow (see the quick capability check after this list).
- Works on **Linux** and **Windows** via WSL.
- Supports 4bit and 16bit QLoRA / LoRA finetuning via [bitsandbytes](https://github.com/TimDettmers/bitsandbytes).
- The open-source version trains 5x faster - see [Unsloth Pro](https://unsloth.ai/) for up to **30x faster training**!
- If you trained a model with 🦥Unsloth, you can use this cool sticker! <img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/made with unsloth.png" height="50" align="center" />
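Not sure whether your card clears the CUDA Capability 7.0 bar mentioned above? A quick check using plain PyTorch (no Unsloth APIs involved):

```python
import torch

# CUDA Compute Capability as (major, minor); Unsloth needs major >= 7 (V100, T4, RTX 20xx and newer).
major, minor = torch.cuda.get_device_capability()
print(f"{torch.cuda.get_device_name(0)}: compute capability {major}.{minor}")
print("OK for Unsloth" if major >= 7 else "Below CUDA Capability 7.0 - unsupported or slow")
```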
## 🥇 Performance Benchmarking
- For the full list of **reproducible** benchmarking tables, [go to our website](https://unsloth.ai/blog/mistral-benchmark#Benchmark%20tables)
- Benchmarks are run on a single A100 40GB and compare 🤗 Hugging Face, Flash Attention, 🦥 Unsloth Open Source, and 🦥 [Unsloth Pro](https://unsloth.ai/pricing).
## 🛠️ Installation Instructions
### Conda Installation
Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA 12.1. If you have `mamba`, use `mamba` instead of `conda` for faster solving. See this [GitHub issue](https://github.com/unslothai/unsloth/issues/73) for help debugging Conda installs.
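A representative Conda setup looks roughly like the sketch below; the exact package pins and the `unsloth[colab-new]` extra follow the install docs and may change, so verify them before copying:

```bash
# Illustrative Conda environment - swap pytorch-cuda=12.1 for 11.8 to match your driver.
conda create --name unsloth_env python=3.10 pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers -y
conda activate unsloth_env
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps trl peft accelerate bitsandbytes
```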
### Pip Installation
Do **NOT** use this if you have Anaconda. You must use the Conda install method, or else stuff will BREAK.
1. Find your CUDA version via
```python
import torch; torch.version.cuda
```
2. For PyTorch 2.1.0: you can update PyTorch via pip (interchange `cu121` / `cu118`). Go to https://pytorch.org/ to learn more. Select `cu118` for CUDA 11.8 or `cu121` for CUDA 12.1. If you have an RTX 3060 or higher (A100, H100 etc), use the `"ampere"` path (an illustrative install command appears after this list). For PyTorch 2.1.1: go to step 3. For PyTorch 2.2.0: go to step 4.
8. To troubleshoot installs, try the commands below (all must succeed). Xformers should mostly all be available.
```bash
nvcc
python -m xformers.info
python -m bitsandbytes
```
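As referenced in step 2, the pip extras tag encodes your CUDA version, GPU generation and PyTorch version. An illustrative command, assuming CUDA 12.1, an Ampere-class GPU and PyTorch 2.1.0 (check the documentation for the tag matching your exact combination):

```bash
# Illustrative: CUDA 12.1 + PyTorch 2.1.0 on an Ampere or newer GPU (A100, H100, RTX 30xx/40xx).
pip install "unsloth[cu121-ampere-torch210] @ git+https://github.com/unslothai/unsloth.git"

# Older GPUs (T4, V100, RTX 20xx) drop the "ampere" part of the tag:
# pip install "unsloth[cu121-torch210] @ git+https://github.com/unslothai/unsloth.git"
```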
## 📜 Documentation
- Go to our [Wiki page](https://github.com/unslothai/unsloth/wiki) for saving to GGUF, checkpointing, evaluation and more!
- We support Hugging Face's TRL, Trainer, Seq2SeqTrainer, or even plain PyTorch code!
- We're in 🤗Hugging Face's official docs! Check out the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth)!
```python
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
# Get an example dataset (any dataset with a "text" field works).
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files={"train": url}, split="train")

# Load a 4bit pre-quantized model for 4x faster downloading and no OOMs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit", # Any supported model works here
    max_seq_length=max_seq_length,
    dtype=None,        # None auto-detects: float16 for T4/V100, bfloat16 for Ampere+
    load_in_4bit=True, # 4bit quantization to reduce memory; set False for 16bit LoRA
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0, # Supports any, but = 0 is optimized
    bias="none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
    random_state=3407,
    max_seq_length=max_seq_length,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None, # And LoftQ
)
trainer=SFTTrainer(
model=model,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
tokenizer=tokenizer,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=10,
max_steps=60,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
output_dir="outputs",
optim="adamw_8bit",
seed=3407,
),
)
trainer.train()
# Go to https://github.com/unslothai/unsloth/wiki for advanced tips like
# (1) Saving to GGUF / merging to 16bit for vLLM
# (2) Continued training from a saved LoRA adapter
# (3) Adding an evaluation loop / OOMs
# (4) Customized chat templates
```
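As a follow-up to tip (1) in the comments above, exporting the finetuned model is a one-liner. A minimal sketch using the saving helpers documented in the Wiki (output paths and the quantization method are placeholders):

```python
# Merge the LoRA adapters into 16bit weights, e.g. for vLLM serving.
model.save_pretrained_merged("model_merged_16bit", tokenizer, save_method="merged_16bit")

# Export to GGUF for llama.cpp / Ollama; q4_k_m is a common quantization choice.
model.save_pretrained_gguf("model_gguf", tokenizer, quantization_method="q4_k_m")
```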
<aname="DPO"></a>
## DPO Support
DPO (Direct Preference Optimization), PPO and reward modelling all work, according to independent third-party testing from [Llama-Factory](https://github.com/hiyouga/LLaMA-Factory). We have a preliminary Google Colab notebook for reproducing Zephyr on a Tesla T4 here: [notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing).
We're in 🤗Hugging Face's official docs! We're on the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and the [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth)!
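A minimal DPO sketch with Unsloth plus TRL's `DPOTrainer` is shown below. The model name, hyperparameters and dataset are placeholders, so treat the Zephyr notebook above as the reference recipe:

```python
from unsloth import FastLanguageModel, PatchDPOTrainer
from transformers import TrainingArguments
from trl import DPOTrainer

PatchDPOTrainer() # Patch TRL's DPOTrainer with Unsloth's fast kernels

max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/zephyr-sft-bnb-4bit", # An SFT'd base, as in the Zephyr notebook
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
)

dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None, # PEFT adapters mean no separate reference model is needed
    beta=0.1,
    train_dataset=YOUR_DATASET_HERE, # Placeholder: needs "prompt" / "chosen" / "rejected" columns
    tokenizer=tokenizer,
    max_length=1024,
    max_prompt_length=512,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=60,
        learning_rate=5e-6,
        output_dir="dpo_outputs",
    ),
)
dpo_trainer.train()
```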