Commit ccfcffb1 authored by chenzk

v1.0
python scripts/convert_hf_checkpoint.py --checkpoint_dir out/TinyLlama-1.1B-900B --model_name tiny_LLaMA_1b
python test_weight.py --checkpoint_dir out/TinyLlama-1.1B-intermediate-900B
python pretrain/tinyllama_code.py --devices 8 --train_data_dir data/code_specialist_python_java_javascript_c_go_8192
python scripts/prepare_starcoder.py --source_path data/starcoderdata/ --tokenizer_path data/llama --destination_path data/code_specialist_python_java_javascript_c_go_8192 --split train --percentage 1.0 --filenames_subset ["python","cpp","go","java","javascript"] --chunk_size 4194816
/data/TinyLlama/out/code_tiny_LLaMA_1b_python_java_go_cpp_javascript/iter-032000-ckpt.pth
python scripts/convert_lit_checkpoint.py --out_dir /data/TinyLlama/out/tiny_LLaMA_1b/ --checkpoint_name iter-100000-ckpt.pth --model_name tiny_LLaMA_1b
import contextlib
import gc
import json
import sys
from functools import partial
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple, Union
import torch
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
from lit_gpt import Config
from lit_gpt.utils import NotYetLoadedTensor, incremental_save, lazy_load
def copy_weights_gpt_neox(
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
weight_map = {
"gpt_neox.embed_in.weight": "transformer.wte.weight",
"gpt_neox.layers.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias",
"gpt_neox.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
"gpt_neox.layers.{}.attention.query_key_value.bias": "transformer.h.{}.attn.attn.bias",
"gpt_neox.layers.{}.attention.query_key_value.weight": "transformer.h.{}.attn.attn.weight",
"gpt_neox.layers.{}.attention.dense.bias": "transformer.h.{}.attn.proj.bias",
"gpt_neox.layers.{}.attention.dense.weight": "transformer.h.{}.attn.proj.weight",
"gpt_neox.layers.{}.attention.rotary_emb.inv_freq": None,
"gpt_neox.layers.{}.attention.bias": None,
"gpt_neox.layers.{}.attention.masked_bias": None,
"gpt_neox.layers.{}.post_attention_layernorm.bias": "transformer.h.{}.norm_2.bias",
"gpt_neox.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight",
"gpt_neox.layers.{}.mlp.dense_h_to_4h.bias": "transformer.h.{}.mlp.fc.bias",
"gpt_neox.layers.{}.mlp.dense_h_to_4h.weight": "transformer.h.{}.mlp.fc.weight",
"gpt_neox.layers.{}.mlp.dense_4h_to_h.bias": "transformer.h.{}.mlp.proj.bias",
"gpt_neox.layers.{}.mlp.dense_4h_to_h.weight": "transformer.h.{}.mlp.proj.weight",
"gpt_neox.final_layer_norm.bias": "transformer.ln_f.bias",
"gpt_neox.final_layer_norm.weight": "transformer.ln_f.weight",
"embed_out.weight": "lm_head.weight",
}
for name, param in hf_weights.items():
if "gpt_neox.layers" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name]
if to_name is None:
continue
to_name = to_name.format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, dtype)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_falcon(
size: Literal["7b", "40b"],
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
weight_map = {
"transformer.word_embeddings.weight": "transformer.wte.weight",
"transformer.h.{}.self_attention.query_key_value.weight": "transformer.h.{}.attn.attn.weight",
"transformer.h.{}.self_attention.dense.weight": "transformer.h.{}.attn.proj.weight",
"transformer.h.{}.mlp.dense_h_to_4h.weight": "transformer.h.{}.mlp.fc.weight",
"transformer.h.{}.mlp.dense_4h_to_h.weight": "transformer.h.{}.mlp.proj.weight",
"transformer.ln_f.bias": "transformer.ln_f.bias",
"transformer.ln_f.weight": "transformer.ln_f.weight",
"lm_head.weight": "lm_head.weight",
}
# the original model definition is different for each size
if size == "7b":
weight_map.update(
{
"transformer.h.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias",
"transformer.h.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
}
)
elif size == "40b":
weight_map.update(
{
"transformer.h.{}.ln_attn.bias": "transformer.h.{}.norm_1.bias",
"transformer.h.{}.ln_attn.weight": "transformer.h.{}.norm_1.weight",
"transformer.h.{}.ln_mlp.bias": "transformer.h.{}.norm_2.bias",
"transformer.h.{}.ln_mlp.weight": "transformer.h.{}.norm_2.weight",
}
)
else:
raise NotImplementedError
for name, param in hf_weights.items():
if "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name].format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, dtype)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_hf_llama(
config: Config,
qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
weight_map = {
"model.embed_tokens.weight": "transformer.wte.weight",
"model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
"model.layers.{}.self_attn.q_proj.weight": None,
"model.layers.{}.self_attn.k_proj.weight": None,
"model.layers.{}.self_attn.v_proj.weight": None,
"model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight",
"model.layers.{}.self_attn.rotary_emb.inv_freq": None,
"model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight",
"model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.swiglu.w1.weight",
"model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.swiglu.w2.weight",
"model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.swiglu.w3.weight",
"model.norm.weight": "transformer.ln_f.weight",
"lm_head.weight": "lm_head.weight",
}
for name, param in hf_weights.items():
if "model.layers" in name:
from_name, number = layer_template(name, 2)
qkv = qkv_weights.setdefault(number, [None, None, None])
if "q_proj" in name:
qkv[0] = param
elif "k_proj" in name:
qkv[1] = param
elif "v_proj" in name:
qkv[2] = param
to_name = weight_map[from_name]
if to_name is None:
continue
to_name = to_name.format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, dtype)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
for i, (q, k, v) in list(qkv_weights.items()):
if q is None or k is None or v is None:
# split across different .bin files
continue
q = load_param(q, f"layer {i} q", dtype)
k = load_param(k, f"layer {i} k", dtype)
v = load_param(v, f"layer {i} v", dtype)
q_per_kv = config.n_head // config.n_query_groups
qs = torch.split(q, config.head_size * q_per_kv)
ks = torch.split(k, config.head_size)
vs = torch.split(v, config.head_size)
cycled = [t for group in zip(qs, ks, vs) for t in group]
qkv = torch.cat(cycled)
state_dict[f"transformer.h.{i}.attn.attn.weight"] = qkv
del qkv_weights[i]
def layer_template(layer_name: str, idx: int) -> Tuple[str, int]:
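# e.g. ("model.layers.3.mlp.up_proj.weight", idx=2) -> ("model.layers.{}.mlp.up_proj.weight", 3)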
split = layer_name.split(".")
number = int(split[idx])
split[idx] = "{}"
from_name = ".".join(split)
return from_name, number
def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: Optional[torch.dtype]) -> torch.Tensor:
if hasattr(param, "_load_tensor"):
# support tensors loaded via `lazy_load()`
print(f"Loading {name!r} into RAM")
param = param._load_tensor()
if dtype is not None and type(dtype) is not NotYetLoadedTensor and dtype != param.dtype:
print(f"Converting {name!r} from {param.dtype} to {dtype}")
param = param.to(dtype)
return param
@torch.inference_mode()
def convert_hf_checkpoint(
*,
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
model_name: Optional[str] = None,
dtype: Optional[str] = None,
) -> None:
if model_name is None:
model_name = checkpoint_dir.name
if dtype is not None:
dtype = getattr(torch, dtype)
config = Config.from_name(model_name)
print(f"Model config {config.__dict__}")
with open(checkpoint_dir / "lit_config.json", "w") as json_config:
json.dump(config.__dict__, json_config)
if "falcon" in model_name:
copy_fn = partial(copy_weights_falcon, "40b" if config.n_embd == 8192 else "7b")
elif config._mlp_class == "LLaMAMLP":
# holder to reconstitute the split q, k, v
qkv_weights = {}
copy_fn = partial(copy_weights_hf_llama, config, qkv_weights)
else:
copy_fn = copy_weights_gpt_neox
# initialize a new empty state dict to hold our new weights
sd = {}
# Load the json file containing weight mapping
pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file
with open(pytorch_bin_map_json_path) as json_map:
bin_index = json.load(json_map)
bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
else:
bin_files = set(checkpoint_dir.glob("*.bin"))
if not bin_files:
raise ValueError(f"Expected {str(checkpoint_dir)!r} to contain .bin files")
with incremental_save(checkpoint_dir / "lit_model.pth") as saver:
# for checkpoints that split the QKV across several files, we need to keep all the bin files
# open, so we use `ExitStack` to close them all together at the end
with contextlib.ExitStack() as stack:
for bin_file in sorted(bin_files):
print("Processing", bin_file)
hf_weights = stack.enter_context(lazy_load(bin_file))
copy_fn(sd, hf_weights, saver=None, dtype=dtype)
gc.collect()
print("Saving converted checkpoint")
saver.save(sd)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(convert_hf_checkpoint)
import contextlib
import gc
import sys
from functools import partial
from pathlib import Path
from typing import Dict, Literal, Optional, Tuple, Union
from dataclasses import asdict
import json
import torch
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
from lit_gpt import Config
from lit_gpt.utils import NotYetLoadedTensor, incremental_save, lazy_load
# from scripts.convert_hf_checkpoint import layer_template, load_param
def layer_template(layer_name: str, idx: int) -> Tuple[str, int]:
split = layer_name.split(".")
number = int(split[idx])
split[idx] = "{}"
from_name = ".".join(split)
return from_name, number
def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: Optional[torch.dtype]) -> torch.Tensor:
if hasattr(param, "_load_tensor"):
# support tensors loaded via `lazy_load()`
print(f"Loading {name!r} into RAM")
param = param._load_tensor()
if dtype is not None and type(dtype) is not NotYetLoadedTensor and dtype != param.dtype:
print(f"Converting {name!r} from {param.dtype} to {dtype}")
param = param.to(dtype)
return param
def copy_weights_falcon(
size: Literal["7b", "40b"],
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
):
weight_map = {
"transformer.wte.weight": "transformer.word_embeddings.weight",
"transformer.h.{}.attn.attn.weight": "transformer.h.{}.self_attention.query_key_value.weight",
"transformer.h.{}.attn.proj.weight": "transformer.h.{}.self_attention.dense.weight",
"transformer.h.{}.mlp.fc.weight": "transformer.h.{}.mlp.dense_h_to_4h.weight",
"transformer.h.{}.mlp.proj.weight": "transformer.h.{}.mlp.dense_4h_to_h.weight",
"transformer.ln_f.bias": "transformer.ln_f.bias",
"transformer.ln_f.weight": "transformer.ln_f.weight",
"lm_head.weight": "lm_head.weight",
}
# the original model definition is different for each size
if size == "7b":
weight_map.update(
{
"transformer.h.{}.norm_1.bias": "transformer.h.{}.input_layernorm.bias",
"transformer.h.{}.norm_1.weight": "transformer.h.{}.input_layernorm.weight",
}
)
elif size == "40b":
weight_map.update(
{
"transformer.h.{}.norm_1.bias": "transformer.h.{}.ln_attn.bias",
"transformer.h.{}.norm_1.weight": "transformer.h.{}.ln_attn.weight",
"transformer.h.{}.norm_2.bias": "transformer.h.{}.ln_mlp.bias",
"transformer.h.{}.norm_2.weight": "transformer.h.{}.ln_mlp.weight",
}
)
else:
raise NotImplementedError
for name, param in lit_weights.items():
if "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name].format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_gpt_neox(
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
) -> None:
weight_map = {
"transformer.wte.weight": "gpt_neox.embed_in.weight",
"transformer.h.{}.norm_1.bias": "gpt_neox.layers.{}.input_layernorm.bias",
"transformer.h.{}.norm_1.weight": "gpt_neox.layers.{}.input_layernorm.weight",
"transformer.h.{}.attn.attn.bias": "gpt_neox.layers.{}.attention.query_key_value.bias",
"transformer.h.{}.attn.attn.weight": "gpt_neox.layers.{}.attention.query_key_value.weight",
"transformer.h.{}.attn.proj.bias": "gpt_neox.layers.{}.attention.dense.bias",
"transformer.h.{}.attn.proj.weight": "gpt_neox.layers.{}.attention.dense.weight",
"transformer.h.{}.norm_2.bias": "gpt_neox.layers.{}.post_attention_layernorm.bias",
"transformer.h.{}.norm_2.weight": "gpt_neox.layers.{}.post_attention_layernorm.weight",
"transformer.h.{}.mlp.fc.bias": "gpt_neox.layers.{}.mlp.dense_h_to_4h.bias",
"transformer.h.{}.mlp.fc.weight": "gpt_neox.layers.{}.mlp.dense_h_to_4h.weight",
"transformer.h.{}.mlp.proj.bias": "gpt_neox.layers.{}.mlp.dense_4h_to_h.bias",
"transformer.h.{}.mlp.proj.weight": "gpt_neox.layers.{}.mlp.dense_4h_to_h.weight",
"transformer.ln_f.bias": "gpt_neox.final_layer_norm.bias",
"transformer.ln_f.weight": "gpt_neox.final_layer_norm.weight",
"lm_head.weight": "embed_out.weight",
}
for name, param in lit_weights.items():
if "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name].format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_llama(
config: Config,
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
):
weight_map = {
"transformer.wte.weight": "model.embed_tokens.weight",
"transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight",
"transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight",
"transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight",
"transformer.h.{}.mlp.swiglu.w1.weight": "model.layers.{}.mlp.gate_proj.weight",
"transformer.h.{}.mlp.swiglu.w2.weight": "model.layers.{}.mlp.up_proj.weight",
"transformer.h.{}.mlp.swiglu.w3.weight": "model.layers.{}.mlp.down_proj.weight",
"transformer.ln_f.weight": "model.norm.weight",
"lm_head.weight": "lm_head.weight",
}
for name, param in lit_weights.items():
if name.endswith(".attn.attn.weight"):
from_name, number = layer_template(name, 2)
q = "model.layers.{}.self_attn.q_proj.weight".format(number)
k = "model.layers.{}.self_attn.k_proj.weight".format(number)
v = "model.layers.{}.self_attn.v_proj.weight".format(number)
qkv = load_param(param, name, None)
qp, kp, vp = tensor_split(qkv, config)
for to_name, param in zip((q, k, v), (qp, kp, vp)):
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
elif "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name]
if to_name is None:
continue
to_name = to_name.format(number)
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
else:
to_name = weight_map[name]
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def tensor_split(
param: Union[torch.Tensor, NotYetLoadedTensor], config: Config
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
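# Inverse of the fused-QKV layout: each block of `blen` rows holds one query group, laid out as
# [q heads ..., k, v]; slice those pieces out and re-concatenate them per projection.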
def kstart(start, blen, klen) -> int:
"""returns start index of keys in batch"""
return start + (blen - (klen * 2))
def vstart(start, blen, klen) -> int:
"""returns start index of values in batch"""
return start + blen - klen
def vend(start, blen) -> int:
"""returns last index of values in batch"""
return start + blen
# num observations
nobs = param.shape[0]
# batch length
blen = nobs // config.n_query_groups
# key length in batch
klen = config.head_size
# value length in batch
vlen = config.head_size
# the starting index of each new batch
starts = range(0, nobs, blen)
# the indices to splice on
splices = [(s, kstart(s, blen, klen), vstart(s, blen, vlen), vend(s, blen)) for s in starts]
qc = ()
kc = ()
vc = ()
for splice in splices:
qs, ks, vs, ve = splice
qc += (param[qs:ks, :],)
kc += (param[ks:vs, :],)
vc += (param[vs:ve, :],)
q = torch.cat(qc)
k = torch.cat(kc)
v = torch.cat(vc)
return q, k, v
def maybe_unwrap_state_dict(lit_weights: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
return lit_weights.get("model", lit_weights)
def check_conversion_supported(lit_weights: Dict[str, torch.Tensor]) -> None:
weight_names = {wk.split(".")[-1] for wk in lit_weights}
# LoRA or QLoRA
if any("lora" in wn for wn in weight_names):
raise ValueError("Model weights must be merged using `lora.merge_lora_weights()` before conversion.")
# adapter v2. adapter_bias will only be in adapter_v2
elif "adapter_bias" in weight_names:
raise NotImplementedError("Converting models finetuned with adapter_v2 not yet supported.")
# adapter. gating_factor is in adapter and adapter_v2
elif "gating_factor" in weight_names:
raise NotImplementedError("Converting models finetuned with adapter not yet supported.")
def get_tinyllama_init_hf_config() -> dict:
return {
"architectures": ["LlamaForCausalLM"],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": None,
"initializer_range": 0.02,
"intermediate_size": None,
"max_position_embeddings": None,
"model_type": "llama",
"num_attention_heads": None,
"num_hidden_layers": None,
"num_key_value_heads": None,
"pretraining_tp": 1,
"rms_norm_eps": None,
"rope_scaling": None,
"tie_word_embeddings": False,
"torch_dtype": "float32",
"transformers_version": "4.31.0.dev0",
"use_cache": True,
"vocab_size": None,
}
def convert_config_lit_to_hf(lit_config_dict: dict) -> dict:
lit_hf_mapping = {
"block_size": "max_position_embeddings",
"vocab_size": "vocab_size",
"n_layer": "num_hidden_layers",
"n_embd": "hidden_size",
"n_head": "num_attention_heads",
"n_query_groups": "num_key_value_heads",
"intermediate_size": "intermediate_size",
"norm_eps": "rms_norm_eps",
}
hf_config_dict = get_tinyllama_init_hf_config()
for lit_key, hf_key in lit_hf_mapping.items():
hf_config_dict[hf_key] = lit_config_dict[lit_key]
return hf_config_dict
@torch.inference_mode()
def convert_lit_checkpoint(*,
checkpoint_name: str,
out_dir: Path,
model_name: str,
model_only: bool = True) -> None:
config = Config.from_name(model_name)
if "falcon" in model_name:
copy_fn = partial(copy_weights_falcon, "40b" if config.n_embd == 8192 else "7b")
elif config._mlp_class == "LLaMAMLP":
copy_fn = partial(copy_weights_llama, config)
else:
copy_fn = copy_weights_gpt_neox
# initialize a new empty state dict to hold our new weights
sd = {}
# checkpoint_name cannot be hardcoded because there exist different outputs, such as
# "lit_model_finetuned.pth", "lit_model_lora_finetuned.pth", and "lit_model_adapter_finetuned.pth"
pth_file = out_dir / checkpoint_name
bin_file = pth_file.with_suffix(".bin")
with incremental_save(bin_file) as saver:
with contextlib.ExitStack() as stack:
lit_weights = stack.enter_context(lazy_load(pth_file))
lit_weights = maybe_unwrap_state_dict(lit_weights)
check_conversion_supported(lit_weights)
# passing the incremental saver here triggers an error, so collect everything and save once at the end
copy_fn(sd, lit_weights, saver=None)
gc.collect()
saver.save(sd)
# convert lit config file to hf-style
if not model_only:
print('Converting config file...')
lit_config = asdict(config)
hf_config = convert_config_lit_to_hf(lit_config)
config_path = out_dir / "config.json"
with open(config_path, "w") as f:
json.dump(hf_config, f, indent=4)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(convert_lit_checkpoint, as_positional=False)
import glob
import json
import os
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import lit_gpt.packed_dataset as packed_dataset
from lit_gpt import Config, Tokenizer
filenames_sample = [
"arxiv_sample.jsonl",
"book_sample.jsonl",
"c4_sample.jsonl",
"cc_2019-30_sample.jsonl",
"cc_2020-05_sample.jsonl",
"cc_2021-04_sample.jsonl",
"cc_2022-05_sample.jsonl",
"cc_2023-06_sample.jsonl",
"github_sample.jsonl",
"stackexchange_sample.jsonl",
"wikipedia_sample.jsonl",
]
filename_sets = {
"arxiv": "arxiv/arxiv*",
"book": "book/book*",
"c4": "c4/c4-train*",
"common_crawl": "common_crawl/*",
"github": "github/filtered*",
"stackexchange": "stackexchange/stackexchange*",
"wikipedia": "wikipedia/wiki*",
}
def prepare_sample(
source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = ""
) -> None:
"""Prepare the "Red Pajama" dataset using the original tokenizer."""
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(checkpoint_dir)
for name in filenames_sample:
if match and match not in name:
continue
filepath = source_path / name
if not filepath.is_file():
raise RuntimeError(
f"Input file not found at {filepath}. \nMake sure you download the data, e.g. wget -i"
" https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
)
prefix, _ = os.path.splitext(name)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=prefix,
chunk_size=chunk_size,
sep_token=tokenizer.eos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
print(f"Processing {name}")
with open(filepath, encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
builder.write_reminder()
def prepare_full(
source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = ""
) -> None:
"""Prepare the "Red Pajama" dataset using the original tokenizer."""
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(checkpoint_dir)
for set_name, pattern in filename_sets.items():
if match and match not in set_name:
continue
is_cc = set_name == "common_crawl"
filenames = glob.glob(os.path.join(source_path, pattern), recursive=True)
if not filenames:
raise RuntimeError(
f"No files matching {pattern} found at {source_path}. \nMake sure you download the data, e.g. wget -i"
" https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=set_name,
chunk_size=chunk_size,
sep_token=tokenizer.eos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for name in filenames:
filepath = source_path / name
print(f"Processing {name}")
if is_cc:
with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
else:
with open(filepath, encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
destination_path: Path = Path("data/redpajama_sample"),
sample: bool = True,
match: str = "",
) -> None:
"""Prepare the "Red Pajama" dataset. We assume tokenizer has been trained."""
with open(checkpoint_dir / "lit_config.json") as fp:
config = Config(**json.load(fp))
prepare_fn = prepare_sample if sample else prepare_full
prepare_fn(
source_path=source_path,
checkpoint_dir=checkpoint_dir,
destination_path=destination_path,
chunk_size=(config.block_size + 1) * 1024, # block size + 1 for causal, 1024 blocks
match=match,
)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)
import json
import glob
import os
from pathlib import Path
import sys
from typing import List
import numpy as np
from tqdm import tqdm
from multiprocessing import Process, cpu_count
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import lit_gpt.packed_dataset as packed_dataset
from lit_gpt import Tokenizer
# Filename for SlimPajama
slimpajama_sets = {
"train": "train/chunk*/*",
"validation": "validation/chunk*/*",
"test": "test/chunk*/*",
}
def prepare_full(
source_path: Path,
tokenizer_path: Path,
destination_path: Path,
chunk_size: int,
split: str="train",
filenames_subset: List[str] = None,
process_id: int = 0
) -> None:
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(tokenizer_path)
# Use the provided filenames_subset or default to all filenames
filenames = filenames_subset
if not filenames:
raise RuntimeError(
f"No files matching {slimpajama_sets[split]} found at {source_path}. \n"
"Make sure you download the data..."
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=f"{split}_slimpajama_{process_id}", # Use process_id to differentiate builders
chunk_size=chunk_size,
sep_token=tokenizer.bos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for filepath in filenames:
print(f"Processing {filepath}")
with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub":
continue # we don't want to include the github data
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
# the remainder is intentionally not written, to avoid a final chunk padded with meaningless bos_ids; see https://github.com/jzhang38/TinyLlama/issues/83 for details
# builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
destination_path: Path = Path("data/red_pajama_sample"),
chunk_size: int = 2049 * 1024,  # block size + 1 for causal, 1024 blocks
split: str="train",
percentage: float = 1.0,
) -> None:
import time
filenames = glob.glob(os.path.join(source_path, slimpajama_sets[split]), recursive=True)
filenames = filenames[:int(len(filenames) * percentage)]
num_processes = cpu_count()
chunked_filenames = np.array_split(filenames, num_processes)
processes = []
start_time = time.time()
for i, subset in enumerate(chunked_filenames):
p = Process(target=prepare_full, args=(source_path, tokenizer_path, destination_path, chunk_size, split, list(subset), i))
processes.append(p)
p.start()
for p in processes:
p.join()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)
import json
import glob
import os
from pathlib import Path
import sys
from typing import List
import numpy as np
from tqdm import tqdm
from multiprocessing import Process, cpu_count
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import lit_gpt.packed_dataset as packed_dataset
from lit_gpt import Tokenizer
import pandas as pd
def prepare_full(
source_path: Path,
tokenizer_path: Path,
destination_path: Path,
chunk_size: int,
split: str="train",
filenames_subset: List[str] = None,
process_id: int = 0
) -> None:
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(tokenizer_path)
# Use the provided filenames_subset or default to all filenames
filenames = filenames_subset
if not filenames:
raise RuntimeError(
f"No files matching found at {source_path}. \n"
"Make sure you download the data..."
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=f"{split}_starcoder_{process_id}", # Use process_id to differentiate builders
chunk_size=chunk_size,
sep_token=tokenizer.bos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for filepath in filenames:
print(f"Processing {filepath}")
try:
contents = pd.read_parquet(filepath, engine='pyarrow')['content']
except Exception as e:
print(f"Error reading {filepath}: {e}")
continue
for text in contents:
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
# the remainder is intentionally not written, to avoid a final chunk padded with meaningless bos_ids; see https://github.com/jzhang38/TinyLlama/issues/83 for details
# builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
destination_path: Path = Path("data/red_pajama_sample"),
chunk_size: int = 2049 * 1024,
split: str="train",
percentage: float = 1.0,
filenames_subset: List[str] = None,
) -> None:
import time
assert split == "train" # starcoder only has train data
filenames = glob.glob(os.path.join(source_path, "*/*.parquet"), recursive=True)
# only retain files whose path matches a prefix in filenames_subset
if filenames_subset:
filenames = [f for f in filenames if any([prefix in f for prefix in filenames_subset])]
filenames = filenames[:int(len(filenames) * percentage)]
num_processes = 64
chunked_filenames = np.array_split(filenames, num_processes)
processes = []
start_time = time.time()
for i, subset in enumerate(chunked_filenames):
p = Process(target=prepare_full, args=(source_path, tokenizer_path, destination_path, chunk_size, split, list(subset), i))
processes.append(p)
p.start()
for p in processes:
p.join()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import defaultdict
import copy
import json
import os
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import pandas as pd
import importlib
from packaging import version
from packaging.version import parse
import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
set_seed,
Seq2SeqTrainer,
BitsAndBytesConfig,
LlamaTokenizer
)
from datasets import load_dataset, Dataset
import evaluate
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
if torch.cuda.is_available():
torch.backends.cuda.matmul.allow_tf32 = True
logger = logging.getLogger(__name__)
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(
default="EleutherAI/pythia-12b"
)
trust_remote_code: Optional[bool] = field(
default=False,
metadata={"help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."}
)
@dataclass
class DataArguments:
eval_dataset_size: int = field(
default=1024, metadata={"help": "Size of validation dataset."}
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
source_max_len: int = field(
default=1024,
metadata={"help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."},
)
target_max_len: int = field(
default=256,
metadata={"help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."},
)
dataset: str = field(
default='alpaca',
metadata={"help": "Which dataset to finetune on. See datamodule for options."}
)
dataset_format: Optional[str] = field(
default=None,
metadata={"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"}
)
@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
train_on_source: Optional[bool] = field(
default=False,
metadata={"help": "Whether to train on the input in addition to the target text."}
)
report_to: str = field(
default='none',
metadata={"help": "To use wandb or something else for reporting."}
)
output_dir: str = field(default='./output', metadata={"help": 'The output dir for logs and checkpoints'})
optim: str = field(default='adamw_torch', metadata={"help": 'The optimizer to be used'})
per_device_train_batch_size: int = field(default=16, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
gradient_accumulation_steps: int = field(default=1, metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'})
max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'})
learning_rate: float = field(default=0.0002, metadata={"help": 'The learning rate'})
remove_unused_columns: bool = field(default=False, metadata={"help": 'Remove unused columns. Needed to make this codebase work.'})
max_grad_norm: float = field(default=0.3, metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'})
gradient_checkpointing: bool = field(default=True, metadata={"help": 'Use gradient checkpointing. You want to use this.'})
do_train: bool = field(default=True, metadata={"help": 'To train or not to train, that is the question?'})
lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})
@dataclass
class GenerationArguments:
# For more hyperparameters check:
# https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
# Length arguments
max_new_tokens: Optional[int] = field(
default=256,
metadata={"help": "Maximum number of new tokens to be generated in evaluation or prediction loops"
"if predict_with_generate is set."}
)
min_new_tokens : Optional[int] = field(
default=None,
metadata={"help": "Minimum number of new tokens to generate."}
)
# Generation strategy
do_sample: Optional[bool] = field(default=False)
num_beams: Optional[int] = field(default=1)
num_beam_groups: Optional[int] = field(default=1)
penalty_alpha: Optional[float] = field(default=None)
use_cache: Optional[bool] = field(default=True)
# Hyperparameters for logit manipulation
temperature: Optional[float] = field(default=1.0)
top_k: Optional[int] = field(default=50)
top_p: Optional[float] = field(default=1.0)
typical_p: Optional[float] = field(default=1.0)
diversity_penalty: Optional[float] = field(default=0.0)
repetition_penalty: Optional[float] = field(default=1.0)
length_penalty: Optional[float] = field(default=1.0)
no_repeat_ngram_size: Optional[int] = field(default=0)
def get_accelerate_model(args, checkpoint_dir):
device_map = "auto"
# if we are in a distributed setting, we need to set the device map and max memory per device
if os.environ.get('LOCAL_RANK') is not None:
local_rank = int(os.environ.get('LOCAL_RANK', '0'))
device_map = {'': local_rank}
print(f'loading base model {args.model_name_or_path}...')
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
device_map=device_map,
trust_remote_code=args.trust_remote_code,
torch_dtype=torch.float16,  # note: fp16 has been observed to give loss=0 here
# torch_dtype=torch.bfloat16,  # alternative if fp16 runs into the loss=0 issue
)
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
padding_side="right",
use_fast=True, # Fast tokenizer giving issues.
trust_remote_code=args.trust_remote_code,
)
if tokenizer._pad_token is None:
special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
if args.dataset == "OpenAssistant/oasst_top1_2023-08-25":
chat_special_tokens = ["<|im_start|>", "<|im_end|>"]
special_tokens_dict.update(additional_special_tokens=chat_special_tokens)
smart_tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model
)
return model, tokenizer
def print_trainable_parameters(args, model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || "
f"all params: {all_param} || "
)
def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
non_special_tokens = None,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + tokenizer.add_tokens(non_special_tokens)
model.resize_token_embeddings(len(tokenizer))
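# if tokens were added, initialize their new embedding rows (input and output) to the mean of the pre-existing rows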
if num_new_tokens > 0:
input_embeddings_data = model.get_input_embeddings().weight.data
output_embeddings_data = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
output_embeddings_data[-num_new_tokens:] = output_embeddings_avg
print(f"Resized tokenizer and embedding to {len(tokenizer)} tokens.")
@dataclass
class DataCollatorForCausalLM(object):
tokenizer: transformers.PreTrainedTokenizer
source_max_len: int
target_max_len: int
train_on_source: bool
predict_with_generate: bool
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
# Extract elements
sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
targets = [f"{example['output']}{self.tokenizer.eos_token}" for example in instances]
# Tokenize
tokenized_sources_with_prompt = self.tokenizer(
sources,
max_length=self.source_max_len,
truncation=True,
add_special_tokens=False,
)
tokenized_targets = self.tokenizer(
targets,
max_length=self.target_max_len,
truncation=True,
add_special_tokens=False,
)
# Build the input and labels for causal LM
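# Source tokens are masked with IGNORE_INDEX so the loss is computed on the target only, unless train_on_source is set.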
input_ids = []
labels = []
for tokenized_source, tokenized_target in zip(
tokenized_sources_with_prompt['input_ids'],
tokenized_targets['input_ids']
):
if not self.predict_with_generate:
input_ids.append(torch.tensor(tokenized_source + tokenized_target))
if not self.train_on_source:
labels.append(
torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
)
else:
labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
else:
input_ids.append(torch.tensor(tokenized_source))
# Apply padding
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
data_dict = {
'input_ids': input_ids,
'attention_mask':input_ids.ne(self.tokenizer.pad_token_id),
}
if labels is not None:
data_dict['labels'] = labels
return data_dict
def extract_unnatural_instructions_data(examples, extract_reformulations=False):
out = {
'input': [],
'output': [],
}
for example_instances in examples['instances']:
for instance in example_instances:
out['input'].append(instance['instruction_with_input'])
out['output'].append(instance['output'])
if extract_reformulations:
for example_reformulations in examples['reformulations']:
if example_reformulations is not None:
for instance in example_reformulations:
out['input'].append(instance['instruction_with_input'])
out['output'].append(instance['output'])
return out
ALPACA_PROMPT_DICT = {
"prompt_input": (
"Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
),
"prompt_no_input": (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response: "
),
}
def extract_alpaca_dataset(example):
if example.get("input", "") != "":
prompt_format = ALPACA_PROMPT_DICT["prompt_input"]
else:
prompt_format = ALPACA_PROMPT_DICT["prompt_no_input"]
return {'input': prompt_format.format(**example)}
def local_dataset(dataset_name):
if dataset_name.endswith('.json') or dataset_name.endswith('.jsonl'):
full_dataset = Dataset.from_json(path_or_paths=dataset_name)
elif dataset_name.endswith('.csv'):
full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
elif dataset_name.endswith('.tsv'):
full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
else:
raise ValueError(f"Unsupported dataset format: {dataset_name}")
split_dataset = full_dataset.train_test_split(test_size=0.1)
return split_dataset
def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
"""
Make dataset and collator for supervised fine-tuning.
Datasets are expected to have the following columns: { `input`, `output` }
Available datasets to be selected with `dataset` argument:
- alpaca, 52002 examples
- alpaca cleaned, 51942 examples
- chip2 (OIG), 210289 examples
- self-instruct, 82612 examples
- hh-rlhf (Anthropic), 160800 examples
- longform, 23.7k examples
- oasst1 (OpenAssistant) primary message tree only, 9,846 examples
Coming soon:
- unnatural instructions core, 66010 examples
- unnatural instructions full, 240670 examples
- alpaca-gpt4, 52002 examples
- unnatural-instructions-gpt4, 9000 examples
- supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
- flan (FLAN v2), up to 20M examples available
- vicuna
"""
def load_data(dataset_name):
if dataset_name == 'alpaca':
return load_dataset("tatsu-lab/alpaca")
elif dataset_name == 'alpaca-clean':
return load_dataset("yahma/alpaca-cleaned")
elif dataset_name == 'chip2':
return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
elif dataset_name == 'hh-rlhf':
return load_dataset("Anthropic/hh-rlhf")
elif dataset_name == 'longform':
return load_dataset("akoksal/LongForm")
elif dataset_name == 'oasst1':
return load_dataset("timdettmers/openassistant-guanaco")
elif dataset_name == "OpenAssistant/oasst_top1_2023-08-25":
return load_dataset("OpenAssistant/oasst_top1_2023-08-25")
elif dataset_name == 'vicuna':
raise NotImplementedError("Vicuna data was not released.")
else:
if os.path.exists(dataset_name):
try:
args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
full_dataset = local_dataset(dataset_name)
return full_dataset
except:
raise ValueError(f"Error loading dataset from {dataset_name}")
else:
raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")
def format_dataset(dataset, dataset_format):
if (
dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or
(dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean'])
):
dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
dataset = dataset.map(lambda x: {
'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
'output': x['text'].split('\n<bot>: ')[1],
})
elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
for old, new in [["prompt", "input"], ["completion", "output"]]:
dataset = dataset.rename_column(old, new)
elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
dataset = dataset.map(lambda x: {
'input': '',
'output': x['chosen']
})
elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
dataset = dataset.map(lambda x: {
'input': '',
'output': x['text'],
})
elif dataset_format == 'input-output':
# leave as is
pass
# Remove unused columns.
dataset = dataset.remove_columns(
[col for col in dataset.column_names['train'] if col not in ['input', 'output']]
)
return dataset
# Load dataset.
dataset = load_data(args.dataset)
dataset = format_dataset(dataset, args.dataset_format)
# Split train/eval, reduce size
if args.do_eval or args.do_predict:
if 'eval' in dataset:
eval_dataset = dataset['eval']
else:
print('Splitting train dataset in train and validation according to `eval_dataset_size`')
dataset = dataset["train"].train_test_split(
test_size=args.eval_dataset_size, shuffle=True, seed=42
)
eval_dataset = dataset['test']
if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
eval_dataset = eval_dataset.select(range(args.max_eval_samples))
if args.group_by_length:
eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
if args.do_train:
train_dataset = dataset['train']
if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
train_dataset = train_dataset.select(range(args.max_train_samples))
if args.group_by_length:
train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
data_collator = DataCollatorForCausalLM(
tokenizer=tokenizer,
source_max_len=args.source_max_len,
target_max_len=args.target_max_len,
train_on_source=args.train_on_source,
predict_with_generate=args.predict_with_generate,
)
return dict(
train_dataset=train_dataset if args.do_train else None,
eval_dataset=eval_dataset if args.do_eval else None,
predict_dataset=eval_dataset if args.do_predict else None,
data_collator=data_collator
)
def get_last_checkpoint(checkpoint_dir):
if isdir(checkpoint_dir):
is_completed = exists(join(checkpoint_dir, 'completed'))
if is_completed: return None, True # already finished
max_step = 0
for filename in os.listdir(checkpoint_dir):
if isdir(join(checkpoint_dir, filename)) and filename.startswith('checkpoint'):
max_step = max(max_step, int(filename.replace('checkpoint-', '')))
if max_step == 0: return None, is_completed # training started, but no checkpoint
checkpoint_dir = join(checkpoint_dir, f'checkpoint-{max_step}')
print(f"Found a previous checkpoint at: {checkpoint_dir}")
return checkpoint_dir, is_completed # checkpoint found!
return None, False # first training
def train():
hfparser = transformers.HfArgumentParser((
ModelArguments, DataArguments, TrainingArguments, GenerationArguments
))
model_args, data_args, training_args, generation_args, extra_args = \
hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
args = argparse.Namespace(
**vars(model_args), **vars(data_args), **vars(training_args)
)
print(args)
checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
if completed_training:
print('Detected that training was already completed!')
model, tokenizer = get_accelerate_model(args, checkpoint_dir)
model.config.use_cache = False
print('loaded model')
set_seed(args.seed)
data_module = make_data_module(tokenizer=tokenizer, args=args)
trainer = Seq2SeqTrainer(
model=model,
tokenizer=tokenizer,
args=training_args,
**{k:v for k,v in data_module.items() if k != 'predict_dataset'},
)
# Verifying the datatypes and parameter counts before training.
print_trainable_parameters(args, model)
dtypes = {}
for _, p in model.named_parameters():
dtype = p.dtype
if dtype not in dtypes: dtypes[dtype] = 0
dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items(): total+= v
for k, v in dtypes.items():
print(k, v, v/total)
all_metrics = {"run_name": args.run_name}
# Training
if args.do_train:
logger.info("*** Train ***")
# Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
# Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
all_metrics.update(metrics)
# Evaluation
if args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate(metric_key_prefix="eval")
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
all_metrics.update(metrics)
# Prediction
if args.do_predict:
logger.info("*** Predict ***")
prediction_output = trainer.predict(test_dataset=data_module['predict_dataset'], metric_key_prefix="predict")
prediction_metrics = prediction_output.metrics
predictions = prediction_output.predictions
predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
predictions = tokenizer.batch_decode(
predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
for i, example in enumerate(data_module['predict_dataset']):
example['prediction_with_input'] = predictions[i].strip()
example['prediction'] = predictions[i].replace(example['input'], '').strip()
fout.write(json.dumps(example) + '\n')
print(prediction_metrics)
trainer.log_metrics("predict", prediction_metrics)
trainer.save_metrics("predict", prediction_metrics)
all_metrics.update(prediction_metrics)
if (args.do_train or args.do_eval or args.do_predict):
with open(os.path.join(args.output_dir, "metrics.json"), "w") as fout:
fout.write(json.dumps(all_metrics))
if __name__ == "__main__":
train()
from transformers import AutoTokenizer
import transformers
import torch
import time
# model = "PY007/TinyLlama-1.1B-Chat-v0.1"
# model = "PY007/TinyLlama-1.1B-intermediate-step-240k-503b"
model = "output/503B_FT_lr1e-5_ep5/checkpoint-2920"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",
model=model,
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = (
f"### Human: {prompt} ### Assistant:"
)
start_time = time.time()
sequences = pipeline(
formatted_prompt,
do_sample=True,
top_k=50,
top_p = 0.9,
num_return_sequences=1,
repetition_penalty=1.1,
max_new_tokens=1024,
)
print("infer time:", time.time() - start_time, "s")
for seq in sequences:
print(f"Result: {seq['generated_text']}")
# We include a simple full-parameter finetuning & inference script here. Our V0.1 chat model is finetuned using this script.
# The FT dataset we use is openassistant-guanaco. For finetuning with less than 4GB RAM, we refer you to the QLoRA and bitsandbytes repos.
# We have not done extensive hyperparameter tuning, nor have we chosen a more performant FT dataset.
# We hope the community will explore finetuning TinyLlama and come up with better chat models. I will include community-finetuned models in this repo.
# V0.1
# CUDA_VISIBLE_DEVICES=0 accelerate launch --main_process_port 1234 sft/finetune.py \
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 sft/finetune.py \
--model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-240k-503b \
--output_dir ./output/503B_FT_lr1e-5_ep5 \
--logging_steps 10 \
--save_strategy epoch \
--data_seed 42 \
--save_total_limit 6 \
--evaluation_strategy epoch \
--eval_dataset_size 512 \
--max_eval_samples 1000 \
--per_device_eval_batch_size 1 \
--max_new_tokens 32 \
--dataloader_num_workers 3 \
--group_by_length=False \
--logging_strategy steps \
--remove_unused_columns False \
--do_train \
--do_eval \
--warmup_ratio 0.05 \
--lr_scheduler_type constant \
--dataset oasst1 \
--source_max_len 16 \
--target_max_len 512 \
--per_device_train_batch_size 4 \
--max_steps 0 \
--num_train_epochs 5 \
--learning_rate 1e-5 \
--adam_beta2 0.999 \
--max_grad_norm 1.0 \
--weight_decay 0.0 \
--seed 0 \
--trust_remote_code \
--report_to wandb
# # V0.2
# CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 sft/finetune.py \
# --model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-480k-1T \
# --output_dir ./output/503B_FT_lr1e-5_ep5_top1_2023-08-25 \
# --logging_steps 10 \
# --save_strategy epoch \
# --data_seed 42 \
# --save_total_limit 6 \
# --evaluation_strategy epoch \
# --eval_dataset_size 512 \
# --max_eval_samples 1000 \
# --per_device_eval_batch_size 1 \
# --max_new_tokens 32 \
# --dataloader_num_workers 3 \
# --group_by_length=False \
# --logging_strategy steps \
# --remove_unused_columns False \
# --do_train \
# --do_eval \
# --warmup_ratio 0.05 \
# --lr_scheduler_type constant \
# --dataset OpenAssistant/oasst_top1_2023-08-25 \
# --dataset_format oasst1 \
# --source_max_len 16 \
# --target_max_len 512 \
# --per_device_train_batch_size 4 \
# --max_steps 0 \
# --num_train_epochs 5 \
# --learning_rate 1e-5 \
# --adam_beta2 0.999 \
# --max_grad_norm 1.0 \
# --weight_decay 0.0 \
# --seed 0 \
# --trust_remote_code \
# --report_to wandb
from transformers import AutoTokenizer
import transformers
import torch
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",
model=model,
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = (
f"### Human: {prompt} ### Assistant:"
)
sequences = pipeline(
formatted_prompt,
do_sample=True,
top_k=50,
top_p = 0.9,
num_return_sequences=1,
repetition_penalty=1.1,
max_new_tokens=1024,
)
for seq in sequences:
print(f"Result: {seq['generated_text']}")
from transformers import AutoTokenizer
import transformers
import torch
model = "PY007/TinyLlama-1.1B-Chat-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",
model=model,
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "How to get in a good university?"
formatted_prompt = (
f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
)
sequences = pipeline(
formatted_prompt,
do_sample=True,
top_k=50,
top_p = 0.9,
num_return_sequences=1,
repetition_penalty=1.1,
max_new_tokens=1024,
)
for seq in sequences:
print(f"Result: {seq['generated_text']}")
## Speculative Decoding
### HuggingFace "Assisted Generation"
| Large Model | Native Decoding | Assisted Decoding |
| ----------- | --------------- | ------------------ |
| guanaco-7b | 69 seconds | 38 seconds |
| guanaco-13b | 84 seconds | 45 seconds |
| guanaco-33b | 109 seconds | 62 seconds |
We use PY007/TinyLlama-1.1B-Chat-v0.1 as the assistant model and vary the large model from guanaco-7b to guanaco-33b. Experiments are run on a single A40 GPU with the code in instruct_hf_assisted_decoding.py. TinyLlama is loaded in fp16 and the large models are loaded in 8-bit, both to make guanaco-33b fit in memory and to keep the setup consistent. The prompt used is "Give me detailed info about Jeo Biden." and max_new_tokens is set to 512.
You can read this [article](https://huggingface.co/blog/assisted-generation) for more information about HuggingFace's Assisted Generation.
Quote from HF: "due to INT8 quantization and the use of causal masking in assisted generation, the output of greedy decoding may differ in rare occasions."
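As a minimal sketch, assisted generation only needs the draft model passed via `assistant_model` (adapted from instruct_hf_assisted_decoding.py below; the guanaco adapter loading is omitted and the base model name here is illustrative):
```
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
inputs = tokenizer("### Human: Give me detailed info about Jeo Biden.### Assistant:", return_tensors="pt").to(device)

# large target model in 8-bit, TinyLlama draft model in fp16
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", load_in_8bit=True)
assistant = AutoModelForCausalLM.from_pretrained("PY007/TinyLlama-1.1B-Chat-v0.1").half().to(device)

# passing `assistant_model` enables assisted (speculative) generation
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=512)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```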
#### TODO
- [ ] Thoroughly benchmark the average speedup on 52K Alpaca prompts.
### Llama.cpp Speculative Decoding
We have continue-pretrained a code TinyLlama from the 500B-token checkpoint on an additional 7B tokens of Python data [here](https://huggingface.co/PY007/TinyLlama-1.1B-python-v0.1).
The code for continue-pretraining can be found in pretrain/tinyllama_code.py.
```
./speculative \
-m models/CodeLlama-7b-hf/ggml-model-f16.gguf \
-md models/TinyLlama-1.1B-500B-python/ggml-model-q4_0.gguf \
-p "# Quick-sort implementation in Python and sample usage:" \
-e -ngl 1 -t 4 -n 256 -s 20 --temp 0 --draft 8
```
This gives:
```
encoded 12 tokens in 0.247 seconds, speed: 48.638 t/s
decoded 265 tokens in 7.909 seconds, speed: 33.507 t/s
n_draft = 16
n_predict = 265
n_drafted = 317
n_accept = 195
accept = 61.514%
draft:
llama_print_timings: load time = 53.14 ms
llama_print_timings: sample time = 652.62 ms / 1 runs ( 652.62 ms per token, 1.53 tokens per second)
llama_print_timings: prompt eval time = 73.81 ms / 12 tokens ( 6.15 ms per token, 162.58 tokens per second)
llama_print_timings: eval time = 2247.77 ms / 378 runs ( 5.95 ms per token, 168.17 tokens per second)
llama_print_timings: total time = 8154.92 ms
target:
llama_print_timings: load time = 534.47 ms
llama_print_timings: sample time = 208.12 ms / 265 runs ( 0.79 ms per token, 1273.32 tokens per second)
llama_print_timings: prompt eval time = 4210.38 ms / 382 tokens ( 11.02 ms per token, 90.73 tokens per second)
llama_print_timings: eval time = 682.80 ms / 16 runs ( 42.68 ms per token, 23.43 tokens per second)
llama_print_timings: total time = 8214.11 ms
ggml_metal_free: deallocating
ggml_metal_free: deallocating
```
Even though the model is continue-pretrained exclusively on Python, it retains its ability in other languages, such as C:
```
./speculative \
-m models/CodeLlama-7b-hf/ggml-model-f16.gguf \
-md models/TinyLlama-1.1B-500B-python/ggml-model-q4_0.gguf \
-p "// Quick-sort implementation in C (4 spaces indentation + detailed comments) and sample usage:\n\n#include" \
-e -ngl 1 -t 4 -n 256 -s 20 --temp 0 --draft 8
```
This gives:
```
encoded 25 tokens in 0.278 seconds, speed: 89.900 t/s
decoded 258 tokens in 6.432 seconds, speed: 40.112 t/s
n_draft = 28
n_predict = 258
n_drafted = 278
n_accept = 200
accept = 71.942%
draft:
llama_print_timings: load time = 932.54 ms
llama_print_timings: sample time = 583.50 ms / 1 runs ( 583.50 ms per token, 1.71 tokens per second)
llama_print_timings: prompt eval time = 81.50 ms / 25 tokens ( 3.26 ms per token, 306.73 tokens per second)
llama_print_timings: eval time = 1834.67 ms / 329 runs ( 5.58 ms per token, 179.32 tokens per second)
llama_print_timings: total time = 6710.30 ms
target:
llama_print_timings: load time = 18568.44 ms
llama_print_timings: sample time = 208.78 ms / 258 runs ( 0.81 ms per token, 1235.75 tokens per second)
llama_print_timings: prompt eval time = 3164.84 ms / 342 tokens ( 9.25 ms per token, 108.06 tokens per second)
llama_print_timings: eval time = 775.43 ms / 18 runs ( 43.08 ms per token, 23.21 tokens per second)
llama_print_timings: total time = 7650.67 ms
ggml_metal_free: deallocating
ggml_metal_free: deallocating
```
I have not tried 13B CodeLlama as the large model yet because my Mac does not have enough memory :).
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
model_id = "huggyllama/llama-13b"
peft_model_id = "timdettmers/guanaco-13b"
assistant_checkpoint = "PY007/TinyLlama-1.1B-Chat-v0.1"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = f"### Human: {prompt}### Assistant:"
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
model.load_adapter(peft_model_id)
print("Large model loaded")
model.config.use_cache = True
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint).half().to(device)
assistant_model.config.use_cache = True
print("Small model loaded")
print("###Native Decoding Starts...\n")
start = time.time()
outputs = model.generate(**inputs, assistant_model=None, max_new_tokens=512)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print("Time: ", end - start)
print("###TinyLlama Assisted Decoding Starts...\n")
start = time.time()
outputs = model.generate(**inputs, assistant_model=assistant_model,max_new_tokens=512)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
# print time in seconds
print("Time: ", end - start)