Commit ccfcffb1 authored by chenzk

v1.0
python scripts/convert_hf_checkpoint.py --checkpoint_dir out/TinyLlama-1.1B-900B --model_name tiny_LLaMA_1b
python test_weight.py --checkpoint_dir out/TinyLlama-1.1B-intermediate-900B
python pretrain/tinyllama_code.py --devices 8 --train_data_dir data/code_specialist_python_java_javascript_c_go_8192
python scripts/prepare_starcoder.py --source_path data/starcoderdata/ --tokenizer_path data/llama --destination_path data/code_specialist_python_java_javascript_c_go_8192 --split train --percentage 1.0 --filenames_subset ["python","cpp","go","java","javascript"] --chunk_size 4194816
/data/TinyLlama/out/code_tiny_LLaMA_1b_python_java_go_cpp_javascript/iter-032000-ckpt.pth
python scripts/convert_lit_checkpoint.py --out_dir /data/TinyLlama/out/tiny_LLaMA_1b/ --checkpoint_name iter-100000-ckpt.pth --model_name tiny_LLaMA_1b
import contextlib
import gc
import json
import sys
from functools import partial
from pathlib import Path
from typing import Dict, List, Literal, Optional, Tuple, Union
import torch
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
from lit_gpt import Config
from lit_gpt.utils import NotYetLoadedTensor, incremental_save, lazy_load
def copy_weights_gpt_neox(
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
weight_map = {
"gpt_neox.embed_in.weight": "transformer.wte.weight",
"gpt_neox.layers.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias",
"gpt_neox.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
"gpt_neox.layers.{}.attention.query_key_value.bias": "transformer.h.{}.attn.attn.bias",
"gpt_neox.layers.{}.attention.query_key_value.weight": "transformer.h.{}.attn.attn.weight",
"gpt_neox.layers.{}.attention.dense.bias": "transformer.h.{}.attn.proj.bias",
"gpt_neox.layers.{}.attention.dense.weight": "transformer.h.{}.attn.proj.weight",
"gpt_neox.layers.{}.attention.rotary_emb.inv_freq": None,
"gpt_neox.layers.{}.attention.bias": None,
"gpt_neox.layers.{}.attention.masked_bias": None,
"gpt_neox.layers.{}.post_attention_layernorm.bias": "transformer.h.{}.norm_2.bias",
"gpt_neox.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight",
"gpt_neox.layers.{}.mlp.dense_h_to_4h.bias": "transformer.h.{}.mlp.fc.bias",
"gpt_neox.layers.{}.mlp.dense_h_to_4h.weight": "transformer.h.{}.mlp.fc.weight",
"gpt_neox.layers.{}.mlp.dense_4h_to_h.bias": "transformer.h.{}.mlp.proj.bias",
"gpt_neox.layers.{}.mlp.dense_4h_to_h.weight": "transformer.h.{}.mlp.proj.weight",
"gpt_neox.final_layer_norm.bias": "transformer.ln_f.bias",
"gpt_neox.final_layer_norm.weight": "transformer.ln_f.weight",
"embed_out.weight": "lm_head.weight",
}
for name, param in hf_weights.items():
if "gpt_neox.layers" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name]
if to_name is None:
continue
to_name = to_name.format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, dtype)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_falcon(
size: Literal["7b", "40b"],
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
weight_map = {
"transformer.word_embeddings.weight": "transformer.wte.weight",
"transformer.h.{}.self_attention.query_key_value.weight": "transformer.h.{}.attn.attn.weight",
"transformer.h.{}.self_attention.dense.weight": "transformer.h.{}.attn.proj.weight",
"transformer.h.{}.mlp.dense_h_to_4h.weight": "transformer.h.{}.mlp.fc.weight",
"transformer.h.{}.mlp.dense_4h_to_h.weight": "transformer.h.{}.mlp.proj.weight",
"transformer.ln_f.bias": "transformer.ln_f.bias",
"transformer.ln_f.weight": "transformer.ln_f.weight",
"lm_head.weight": "lm_head.weight",
}
# the original model definition is different for each size
if size == "7b":
weight_map.update(
{
"transformer.h.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias",
"transformer.h.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
}
)
elif size == "40b":
weight_map.update(
{
"transformer.h.{}.ln_attn.bias": "transformer.h.{}.norm_1.bias",
"transformer.h.{}.ln_attn.weight": "transformer.h.{}.norm_1.weight",
"transformer.h.{}.ln_mlp.bias": "transformer.h.{}.norm_2.bias",
"transformer.h.{}.ln_mlp.weight": "transformer.h.{}.norm_2.weight",
}
)
else:
raise NotImplementedError
for name, param in hf_weights.items():
if "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name].format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, dtype)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_hf_llama(
config: Config,
qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
weight_map = {
"model.embed_tokens.weight": "transformer.wte.weight",
"model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
"model.layers.{}.self_attn.q_proj.weight": None,
"model.layers.{}.self_attn.k_proj.weight": None,
"model.layers.{}.self_attn.v_proj.weight": None,
"model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight",
"model.layers.{}.self_attn.rotary_emb.inv_freq": None,
"model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight",
"model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.swiglu.w1.weight",
"model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.swiglu.w2.weight",
"model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.swiglu.w3.weight",
"model.norm.weight": "transformer.ln_f.weight",
"lm_head.weight": "lm_head.weight",
}
for name, param in hf_weights.items():
if "model.layers" in name:
from_name, number = layer_template(name, 2)
qkv = qkv_weights.setdefault(number, [None, None, None])
if "q_proj" in name:
qkv[0] = param
elif "k_proj" in name:
qkv[1] = param
elif "v_proj" in name:
qkv[2] = param
to_name = weight_map[from_name]
if to_name is None:
continue
to_name = to_name.format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, dtype)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
for i, (q, k, v) in list(qkv_weights.items()):
if q is None or k is None or v is None:
# split across different .bin files
continue
q = load_param(q, f"layer {i} q", dtype)
k = load_param(k, f"layer {i} k", dtype)
v = load_param(v, f"layer {i} v", dtype)
q_per_kv = config.n_head // config.n_query_groups
qs = torch.split(q, config.head_size * q_per_kv)
ks = torch.split(k, config.head_size)
vs = torch.split(v, config.head_size)
cycled = [t for group in zip(qs, ks, vs) for t in group]
qkv = torch.cat(cycled)
state_dict[f"transformer.h.{i}.attn.attn.weight"] = qkv
del qkv_weights[i]
def layer_template(layer_name: str, idx: int) -> Tuple[str, int]:
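# e.g. ("model.layers.3.mlp.up_proj.weight", idx=2) -> ("model.layers.{}.mlp.up_proj.weight", 3)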
split = layer_name.split(".")
number = int(split[idx])
split[idx] = "{}"
from_name = ".".join(split)
return from_name, number
def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: Optional[torch.dtype]) -> torch.Tensor:
if hasattr(param, "_load_tensor"):
# support tensors loaded via `lazy_load()`
print(f"Loading {name!r} into RAM")
param = param._load_tensor()
if dtype is not None and type(dtype) is not NotYetLoadedTensor and dtype != param.dtype:
print(f"Converting {name!r} from {param.dtype} to {dtype}")
param = param.to(dtype)
return param
@torch.inference_mode()
def convert_hf_checkpoint(
*,
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
model_name: Optional[str] = None,
dtype: Optional[str] = None,
) -> None:
if model_name is None:
model_name = checkpoint_dir.name
if dtype is not None:
dtype = getattr(torch, dtype)
config = Config.from_name(model_name)
print(f"Model config {config.__dict__}")
with open(checkpoint_dir / "lit_config.json", "w") as json_config:
json.dump(config.__dict__, json_config)
if "falcon" in model_name:
copy_fn = partial(copy_weights_falcon, "40b" if config.n_embd == 8192 else "7b")
elif config._mlp_class == "LLaMAMLP":
# holder to reconstitute the split q, k, v
qkv_weights = {}
copy_fn = partial(copy_weights_hf_llama, config, qkv_weights)
else:
copy_fn = copy_weights_gpt_neox
# initialize a new empty state dict to hold our new weights
sd = {}
# Load the json file containing weight mapping
pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
if pytorch_bin_map_json_path.is_file(): # not all checkpoints have this file
with open(pytorch_bin_map_json_path) as json_map:
bin_index = json.load(json_map)
bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
else:
bin_files = set(checkpoint_dir.glob("*.bin"))
if not bin_files:
raise ValueError(f"Expected {str(checkpoint_dir)!r} to contain .bin files")
with incremental_save(checkpoint_dir / "lit_model.pth") as saver:
# for checkpoints that split the QKV across several files, we need to keep all the bin files
# open, so we use `ExitStack` to close them all together at the end
with contextlib.ExitStack() as stack:
for bin_file in sorted(bin_files):
print("Processing", bin_file)
hf_weights = stack.enter_context(lazy_load(bin_file))
copy_fn(sd, hf_weights, saver=None, dtype=dtype)
gc.collect()
print("Saving converted checkpoint")
saver.save(sd)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(convert_hf_checkpoint)
import contextlib
import gc
import sys
from functools import partial
from pathlib import Path
from typing import Dict, Literal, Optional, Tuple, Union
from dataclasses import asdict
import json
import torch
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
from lit_gpt import Config
from lit_gpt.utils import NotYetLoadedTensor, incremental_save, lazy_load
# from scripts.convert_hf_checkpoint import layer_template, load_param
def layer_template(layer_name: str, idx: int) -> Tuple[str, int]:
split = layer_name.split(".")
number = int(split[idx])
split[idx] = "{}"
from_name = ".".join(split)
return from_name, number
def load_param(param: Union[torch.Tensor, NotYetLoadedTensor], name: str, dtype: Optional[torch.dtype]) -> torch.Tensor:
if hasattr(param, "_load_tensor"):
# support tensors loaded via `lazy_load()`
print(f"Loading {name!r} into RAM")
param = param._load_tensor()
if dtype is not None and type(dtype) is not NotYetLoadedTensor and dtype != param.dtype:
print(f"Converting {name!r} from {param.dtype} to {dtype}")
param = param.to(dtype)
return param
def copy_weights_falcon(
size: Literal["7b", "40b"],
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
):
weight_map = {
"transformer.wte.weight": "transformer.word_embeddings.weight",
"transformer.h.{}.attn.attn.weight": "transformer.h.{}.self_attention.query_key_value.weight",
"transformer.h.{}.attn.proj.weight": "transformer.h.{}.self_attention.dense.weight",
"transformer.h.{}.mlp.fc.weight": "transformer.h.{}.mlp.dense_h_to_4h.weight",
"transformer.h.{}.mlp.proj.weight": "transformer.h.{}.mlp.dense_4h_to_h.weight",
"transformer.ln_f.bias": "transformer.ln_f.bias",
"transformer.ln_f.weight": "transformer.ln_f.weight",
"lm_head.weight": "lm_head.weight",
}
# the original model definition is different for each size
if size == "7b":
weight_map.update(
{
"transformer.h.{}.norm_1.bias": "transformer.h.{}.input_layernorm.bias",
"transformer.h.{}.norm_1.weight": "transformer.h.{}.input_layernorm.weight",
}
)
elif size == "40b":
weight_map.update(
{
"transformer.h.{}.norm_1.bias": "transformer.h.{}.ln_attn.bias",
"transformer.h.{}.norm_1.weight": "transformer.h.{}.ln_attn.weight",
"transformer.h.{}.norm_2.bias": "transformer.h.{}.ln_mlp.bias",
"transformer.h.{}.norm_2.weight": "transformer.h.{}.ln_mlp.weight",
}
)
else:
raise NotImplementedError
for name, param in lit_weights.items():
if "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name].format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_gpt_neox(
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
) -> None:
weight_map = {
"transformer.wte.weight": "gpt_neox.embed_in.weight",
"transformer.h.{}.norm_1.bias": "gpt_neox.layers.{}.input_layernorm.bias",
"transformer.h.{}.norm_1.weight": "gpt_neox.layers.{}.input_layernorm.weight",
"transformer.h.{}.attn.attn.bias": "gpt_neox.layers.{}.attention.query_key_value.bias",
"transformer.h.{}.attn.attn.weight": "gpt_neox.layers.{}.attention.query_key_value.weight",
"transformer.h.{}.attn.proj.bias": "gpt_neox.layers.{}.attention.dense.bias",
"transformer.h.{}.attn.proj.weight": "gpt_neox.layers.{}.attention.dense.weight",
"transformer.h.{}.norm_2.bias": "gpt_neox.layers.{}.post_attention_layernorm.bias",
"transformer.h.{}.norm_2.weight": "gpt_neox.layers.{}.post_attention_layernorm.weight",
"transformer.h.{}.mlp.fc.bias": "gpt_neox.layers.{}.mlp.dense_h_to_4h.bias",
"transformer.h.{}.mlp.fc.weight": "gpt_neox.layers.{}.mlp.dense_h_to_4h.weight",
"transformer.h.{}.mlp.proj.bias": "gpt_neox.layers.{}.mlp.dense_4h_to_h.bias",
"transformer.h.{}.mlp.proj.weight": "gpt_neox.layers.{}.mlp.dense_4h_to_h.weight",
"transformer.ln_f.bias": "gpt_neox.final_layer_norm.bias",
"transformer.ln_f.weight": "gpt_neox.final_layer_norm.weight",
"lm_head.weight": "embed_out.weight",
}
for name, param in lit_weights.items():
if "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name].format(number)
else:
to_name = weight_map[name]
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def copy_weights_llama(
config: Config,
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
saver: Optional[incremental_save] = None,
):
weight_map = {
"transformer.wte.weight": "model.embed_tokens.weight",
"transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight",
"transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight",
"transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight",
"transformer.h.{}.mlp.swiglu.w1.weight": "model.layers.{}.mlp.gate_proj.weight",
"transformer.h.{}.mlp.swiglu.w2.weight": "model.layers.{}.mlp.up_proj.weight",
"transformer.h.{}.mlp.swiglu.w3.weight": "model.layers.{}.mlp.down_proj.weight",
"transformer.ln_f.weight": "model.norm.weight",
"lm_head.weight": "lm_head.weight",
}
for name, param in lit_weights.items():
if name.endswith(".attn.attn.weight"):
from_name, number = layer_template(name, 2)
q = "model.layers.{}.self_attn.q_proj.weight".format(number)
k = "model.layers.{}.self_attn.k_proj.weight".format(number)
v = "model.layers.{}.self_attn.v_proj.weight".format(number)
qkv = load_param(param, name, None)
qp, kp, vp = tensor_split(qkv, config)
for to_name, param in zip((q, k, v), (qp, kp, vp)):
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
elif "transformer.h" in name:
from_name, number = layer_template(name, 2)
to_name = weight_map[from_name]
if to_name is None:
continue
to_name = to_name.format(number)
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
else:
to_name = weight_map[name]
param = load_param(param, name, None)
if saver is not None:
param = saver.store_early(param)
state_dict[to_name] = param
def tensor_split(
param: Union[torch.Tensor, NotYetLoadedTensor], config: Config
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
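# Inverse of the fused-QKV layout: each block of `blen` rows holds one query group, laid out as
# [q heads ..., k, v]; slice those pieces out and re-concatenate them per projection.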
def kstart(start, blen, klen) -> int:
"""returns start index of keys in batch"""
return start + (blen - (klen * 2))
def vstart(start, blen, klen) -> int:
"""returns start index of values in batch"""
return start + blen - klen
def vend(start, blen) -> int:
"""returns last index of values in batch"""
return start + blen
# num observations
nobs = param.shape[0]
# batch length
blen = nobs // config.n_query_groups
# key length in batch
klen = config.head_size
# value length in batch
vlen = config.head_size
# the starting index of each new batch
starts = range(0, nobs, blen)
# the indices to splice on
splices = [(s, kstart(s, blen, klen), vstart(s, blen, vlen), vend(s, blen)) for s in starts]
qc = ()
kc = ()
vc = ()
for splice in splices:
qs, ks, vs, ve = splice
qc += (param[qs:ks, :],)
kc += (param[ks:vs, :],)
vc += (param[vs:ve, :],)
q = torch.cat(qc)
k = torch.cat(kc)
v = torch.cat(vc)
return q, k, v
def maybe_unwrap_state_dict(lit_weights: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
return lit_weights.get("model", lit_weights)
def check_conversion_supported(lit_weights: Dict[str, torch.Tensor]) -> None:
weight_names = {wk.split(".")[-1] for wk in lit_weights}
# LoRA or QLoRA
if any("lora" in wn for wn in weight_names):
raise ValueError("Model weights must be merged using `lora.merge_lora_weights()` before conversion.")
# adapter v2. adapter_bias will only be in adapter_v2
elif "adapter_bias" in weight_names:
raise NotImplementedError("Converting models finetuned with adapter_v2 not yet supported.")
# adapter. gating_factor is in adapter and adapter_v2
elif "gating_factor" in weight_names:
raise NotImplementedError("Converting models finetuned with adapter not yet supported.")
def get_tinyllama_init_hf_config() -> dict:
return {
"architectures": ["LlamaForCausalLM"],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": None,
"initializer_range": 0.02,
"intermediate_size": None,
"max_position_embeddings": None,
"model_type": "llama",
"num_attention_heads": None,
"num_hidden_layers": None,
"num_key_value_heads": None,
"pretraining_tp": 1,
"rms_norm_eps": None,
"rope_scaling": None,
"tie_word_embeddings": False,
"torch_dtype": "float32",
"transformers_version": "4.31.0.dev0",
"use_cache": True,
"vocab_size": None,
}
def convert_config_lit_to_hf(lit_config_dict: dict) -> dict:
lit_hf_mapping = {
"block_size": "max_position_embeddings",
"vocab_size": "vocab_size",
"n_layer": "num_hidden_layers",
"n_embd": "hidden_size",
"n_head": "num_attention_heads",
"n_query_groups": "num_key_value_heads",
"intermediate_size": "intermediate_size",
"norm_eps": "rms_norm_eps",
}
hf_config_dict = get_tinyllama_init_hf_config()
for lit_key, hf_key in lit_hf_mapping.items():
hf_config_dict[hf_key] = lit_config_dict[lit_key]
return hf_config_dict
@torch.inference_mode()
def convert_lit_checkpoint(*,
checkpoint_name: str,
out_dir: Path,
model_name: str,
model_only: bool = True) -> None:
config = Config.from_name(model_name)
if "falcon" in model_name:
copy_fn = partial(copy_weights_falcon, "40b" if config.n_embd == 8192 else "7b")
elif config._mlp_class == "LLaMAMLP":
copy_fn = partial(copy_weights_llama, config)
else:
copy_fn = copy_weights_gpt_neox
# initialize a new empty state dict to hold our new weights
sd = {}
# checkpoint_name cannot be hardcoded because there exist different outputs, such as
# "lit_model_finetuned.pth", "lit_model_lora_finetuned.pth", and "lit_model_adapter_finetuned.pth"
pth_file = out_dir / checkpoint_name
bin_file = pth_file.with_suffix(".bin")
with incremental_save(bin_file) as saver:
with contextlib.ExitStack() as stack:
lit_weights = stack.enter_context(lazy_load(pth_file))
lit_weights = maybe_unwrap_state_dict(lit_weights)
check_conversion_supported(lit_weights)
# passing the incremental saver here triggers an error, so collect everything and save once at the end
copy_fn(sd, lit_weights, saver=None)
gc.collect()
saver.save(sd)
# convert lit config file to hf-style
if not model_only:
print('Converting config file...')
lit_config = asdict(config)
hf_config = convert_config_lit_to_hf(lit_config)
config_path = out_dir / "config.json"
with open(config_path, "w") as f:
json.dump(hf_config, f, indent=4)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(convert_lit_checkpoint, as_positional=False)
import glob
import json
import os
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import lit_gpt.packed_dataset as packed_dataset
from lit_gpt import Config, Tokenizer
filenames_sample = [
"arxiv_sample.jsonl",
"book_sample.jsonl",
"c4_sample.jsonl",
"cc_2019-30_sample.jsonl",
"cc_2020-05_sample.jsonl",
"cc_2021-04_sample.jsonl",
"cc_2022-05_sample.jsonl",
"cc_2023-06_sample.jsonl",
"github_sample.jsonl",
"stackexchange_sample.jsonl",
"wikipedia_sample.jsonl",
]
filename_sets = {
"arxiv": "arxiv/arxiv*",
"book": "book/book*",
"c4": "c4/c4-train*",
"common_crawl": "common_crawl/*",
"github": "github/filtered*",
"stackexchange": "stackexchange/stackexchange*",
"wikipedia": "wikipedia/wiki*",
}
def prepare_sample(
source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = ""
) -> None:
"""Prepare the "Red Pajama" dataset using the original tokenizer."""
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(checkpoint_dir)
for name in filenames_sample:
if match and match not in name:
continue
filepath = source_path / name
if not filepath.is_file():
raise RuntimeError(
f"Input file not found at {filepath}. \nMake sure you download the data, e.g. wget -i"
" https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
)
prefix, _ = os.path.splitext(name)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=prefix,
chunk_size=chunk_size,
sep_token=tokenizer.eos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
print(f"Processing {name}")
with open(filepath, encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
builder.write_reminder()
def prepare_full(
source_path: Path, checkpoint_dir: Path, destination_path: Path, chunk_size: int, match: str = ""
) -> None:
"""Prepare the "Red Pajama" dataset using the original tokenizer."""
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(checkpoint_dir)
for set_name, pattern in filename_sets.items():
if match and match not in set_name:
continue
is_cc = set_name == "common_crawl"
filenames = glob.glob(os.path.join(source_path, pattern), recursive=True)
if not filenames:
raise RuntimeError(
f"No files matching {pattern} found at {source_path}. \nMake sure you download the data, e.g. wget -i"
" https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T"
" \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=set_name,
chunk_size=chunk_size,
sep_token=tokenizer.eos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for name in filenames:
filepath = source_path / name
print(f"Processing {name}")
if is_cc:
with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
else:
with open(filepath, encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
destination_path: Path = Path("data/redpajama_sample"),
sample: bool = True,
match: str = "",
) -> None:
"""Prepare the "Red Pajama" dataset. We assume tokenizer has been trained."""
with open(checkpoint_dir / "lit_config.json") as fp:
config = Config(**json.load(fp))
prepare_fn = prepare_sample if sample else prepare_full
prepare_fn(
source_path=source_path,
checkpoint_dir=checkpoint_dir,
destination_path=destination_path,
chunk_size=(config.block_size + 1) * 1024, # block size + 1 for causal, 1024 blocks
match=match,
)
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)
import json
import glob
import os
from pathlib import Path
import sys
from typing import List
import numpy as np
from tqdm import tqdm
from multiprocessing import Process, cpu_count
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import lit_gpt.packed_dataset as packed_dataset
from lit_gpt import Tokenizer
# Filename for SlimPajama
slimpajama_sets = {
"train": "train/chunk*/*",
"validation": "validation/chunk*/*",
"test": "test/chunk*/*",
}
def prepare_full(
source_path: Path,
tokenizer_path: Path,
destination_path: Path,
chunk_size: int,
split: str="train",
filenames_subset: List[str] = None,
process_id: int = 0
) -> None:
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(tokenizer_path)
# Use the provided filenames_subset or default to all filenames
filenames = filenames_subset
if not filenames:
raise RuntimeError(
f"No files matching {slimpajama_sets[split]} found at {source_path}. \n"
"Make sure you download the data..."
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=f"{split}_slimpajama_{process_id}", # Use process_id to differentiate builders
chunk_size=chunk_size,
sep_token=tokenizer.bos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for filepath in filenames:
print(f"Processing {filepath}")
with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
for row in tqdm(f):
text = json.loads(row)["text"]
if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub":
continue # we don't want to include the github data
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
# the remainder is intentionally not written, to avoid a final chunk padded with meaningless bos_ids; see https://github.com/jzhang38/TinyLlama/issues/83 for details
# builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
destination_path: Path = Path("data/red_pajama_sample"),
chunk_size: int = 2049 * 1024,  # block size + 1 for causal, 1024 blocks
split: str="train",
percentage: float = 1.0,
) -> None:
import time
filenames = glob.glob(os.path.join(source_path, slimpajama_sets[split]), recursive=True)
filenames = filenames[:int(len(filenames) * percentage)]
num_processes = cpu_count()
chunked_filenames = np.array_split(filenames, num_processes)
processes = []
start_time = time.time()
for i, subset in enumerate(chunked_filenames):
p = Process(target=prepare_full, args=(source_path, tokenizer_path, destination_path, chunk_size, split, list(subset), i))
processes.append(p)
p.start()
for p in processes:
p.join()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)
import json
import glob
import os
from pathlib import Path
import sys
from typing import List
import numpy as np
from tqdm import tqdm
from multiprocessing import Process, cpu_count
# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
import lit_gpt.packed_dataset as packed_dataset
from lit_gpt import Tokenizer
import pandas as pd
def prepare_full(
source_path: Path,
tokenizer_path: Path,
destination_path: Path,
chunk_size: int,
split: str="train",
filenames_subset: List[str] = None,
process_id: int = 0
) -> None:
import zstandard as zstd
destination_path.mkdir(parents=True, exist_ok=True)
tokenizer = Tokenizer(tokenizer_path)
# Use the provided filenames_subset or default to all filenames
filenames = filenames_subset
if not filenames:
raise RuntimeError(
f"No files matching found at {source_path}. \n"
"Make sure you download the data..."
)
builder = packed_dataset.PackedDatasetBuilder(
outdir=destination_path,
prefix=f"{split}_starcoder_{process_id}", # Use process_id to differentiate builders
chunk_size=chunk_size,
sep_token=tokenizer.bos_id,
dtype="auto",
vocab_size=tokenizer.vocab_size,
)
for filepath in filenames:
print(f"Processing {filepath}")
try:
contents = pd.read_parquet(filepath, engine='pyarrow')['content']
except Exception as e:
print(f"Error reading {filepath}: {e}")
continue
for text in contents:
text_ids = tokenizer.encode(text)
builder.add_array(np.array(text_ids, dtype=builder.dtype))
# the remainder is intentionally not written, to avoid a final chunk padded with meaningless bos_ids; see https://github.com/jzhang38/TinyLlama/issues/83 for details
# builder.write_reminder()
def prepare(
source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
destination_path: Path = Path("data/red_pajama_sample"),
chunk_size: int = 2049 * 1024,
split: str="train",
percentage: float = 1.0,
filenames_subset: List[str] = None,
) -> None:
import time
assert split == "train" # starcoder only has train data
filenames = glob.glob(os.path.join(source_path, "*/*.parquet"), recursive=True)
# only retain files whose path matches a prefix in filenames_subset
if filenames_subset:
filenames = [f for f in filenames if any([prefix in f for prefix in filenames_subset])]
filenames = filenames[:int(len(filenames) * percentage)]
num_processes = 64
chunked_filenames = np.array_split(filenames, num_processes)
processes = []
start_time = time.time()
for i, subset in enumerate(chunked_filenames):
p = Process(target=prepare_full, args=(source_path, tokenizer_path, destination_path, chunk_size, split, list(subset), i))
processes.append(p)
p.start()
for p in processes:
p.join()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
if __name__ == "__main__":
from jsonargparse import CLI
CLI(prepare)
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import defaultdict
import copy
import json
import os
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import pandas as pd
import importlib
from packaging import version
from packaging.version import parse
import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
set_seed,
Seq2SeqTrainer,
BitsAndBytesConfig,
LlamaTokenizer
)
from datasets import load_dataset, Dataset
import evaluate
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
if torch.cuda.is_available():
torch.backends.cuda.matmul.allow_tf32 = True
logger = logging.getLogger(__name__)
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(
default="EleutherAI/pythia-12b"
)
trust_remote_code: Optional[bool] = field(
default=False,
metadata={"help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."}
)
@dataclass
class DataArguments:
eval_dataset_size: int = field(
default=1024, metadata={"help": "Size of validation dataset."}
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
source_max_len: int = field(
default=1024,
metadata={"help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."},
)
target_max_len: int = field(
default=256,
metadata={"help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."},
)
dataset: str = field(
default='alpaca',
metadata={"help": "Which dataset to finetune on. See datamodule for options."}
)
dataset_format: Optional[str] = field(
default=None,
metadata={"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"}
)
@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
train_on_source: Optional[bool] = field(
default=False,
metadata={"help": "Whether to train on the input in addition to the target text."}
)
report_to: str = field(
default='none',
metadata={"help": "To use wandb or something else for reporting."}
)
output_dir: str = field(default='./output', metadata={"help": 'The output dir for logs and checkpoints'})
optim: str = field(default='adamw_torch', metadata={"help": 'The optimizer to be used'})
per_device_train_batch_size: int = field(default=16, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
gradient_accumulation_steps: int = field(default=1, metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'})
max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'})
learning_rate: float = field(default=0.0002, metadata={"help": 'The learning rate'})
remove_unused_columns: bool = field(default=False, metadata={"help": 'Remove unused columns. Needed to make this codebase work.'})
max_grad_norm: float = field(default=0.3, metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'})
gradient_checkpointing: bool = field(default=True, metadata={"help": 'Use gradient checkpointing. You want to use this.'})
do_train: bool = field(default=True, metadata={"help": 'To train or not to train, that is the question?'})
lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})
@dataclass
class GenerationArguments:
# For more hyperparameters check:
# https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
# Length arguments
max_new_tokens: Optional[int] = field(
default=256,
metadata={"help": "Maximum number of new tokens to be generated in evaluation or prediction loops"
"if predict_with_generate is set."}
)
min_new_tokens : Optional[int] = field(
default=None,
metadata={"help": "Minimum number of new tokens to generate."}
)
# Generation strategy
do_sample: Optional[bool] = field(default=False)
num_beams: Optional[int] = field(default=1)
num_beam_groups: Optional[int] = field(default=1)
penalty_alpha: Optional[float] = field(default=None)
use_cache: Optional[bool] = field(default=True)
# Hyperparameters for logit manipulation
temperature: Optional[float] = field(default=1.0)
top_k: Optional[int] = field(default=50)
top_p: Optional[float] = field(default=1.0)
typical_p: Optional[float] = field(default=1.0)
diversity_penalty: Optional[float] = field(default=0.0)
repetition_penalty: Optional[float] = field(default=1.0)
length_penalty: Optional[float] = field(default=1.0)
no_repeat_ngram_size: Optional[int] = field(default=0)
def get_accelerate_model(args, checkpoint_dir):
device_map = "auto"
# if we are in a distributed setting, we need to set the device map and max memory per device
if os.environ.get('LOCAL_RANK') is not None:
local_rank = int(os.environ.get('LOCAL_RANK', '0'))
device_map = {'': local_rank}
print(f'loading base model {args.model_name_or_path}...')
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
device_map=device_map,
trust_remote_code=args.trust_remote_code,
torch_dtype=torch.float16,  # note: fp16 has been observed to give loss=0 here
# torch_dtype=torch.bfloat16,  # alternative if fp16 runs into the loss=0 issue
)
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
padding_side="right",
use_fast=True, # Fast tokenizer giving issues.
trust_remote_code=args.trust_remote_code,
)
if tokenizer._pad_token is None:
special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
if args.dataset == "OpenAssistant/oasst_top1_2023-08-25":
chat_special_tokens = ["<|im_start|>", "<|im_end|>"]
special_tokens_dict.update(additional_special_tokens=chat_special_tokens)
smart_tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model
)
return model, tokenizer
def print_trainable_parameters(args, model):
"""
Prints the number of trainable parameters in the model.
"""
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || "
f"all params: {all_param} || "
)
def smart_tokenizer_and_embedding_resize(
special_tokens_dict: Dict,
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
non_special_tokens = None,
):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + tokenizer.add_tokens(non_special_tokens)
model.resize_token_embeddings(len(tokenizer))
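# if tokens were added, initialize their new embedding rows (input and output) to the mean of the pre-existing rows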
if num_new_tokens > 0:
input_embeddings_data = model.get_input_embeddings().weight.data
output_embeddings_data = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
output_embeddings_data[-num_new_tokens:] = output_embeddings_avg
print(f"Resized tokenizer and embedding to {len(tokenizer)} tokens.")
@dataclass
class DataCollatorForCausalLM(object):
tokenizer: transformers.PreTrainedTokenizer
source_max_len: int
target_max_len: int
train_on_source: bool
predict_with_generate: bool
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
# Extract elements
sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
targets = [f"{example['output']}{self.tokenizer.eos_token}" for example in instances]
# Tokenize
tokenized_sources_with_prompt = self.tokenizer(
sources,
max_length=self.source_max_len,
truncation=True,
add_special_tokens=False,
)
tokenized_targets = self.tokenizer(
targets,
max_length=self.target_max_len,
truncation=True,
add_special_tokens=False,
)
# Build the input and labels for causal LM
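# Source tokens are masked with IGNORE_INDEX so the loss is computed on the target only, unless train_on_source is set.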
input_ids = []
labels = []
for tokenized_source, tokenized_target in zip(
tokenized_sources_with_prompt['input_ids'],
tokenized_targets['input_ids']
):
if not self.predict_with_generate:
input_ids.append(torch.tensor(tokenized_source + tokenized_target))
if not self.train_on_source:
labels.append(
torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
)
else:
labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
else:
input_ids.append(torch.tensor(tokenized_source))
# Apply padding
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
data_dict = {
'input_ids': input_ids,
'attention_mask':input_ids.ne(self.tokenizer.pad_token_id),
}
if labels is not None:
data_dict['labels'] = labels
return data_dict
def extract_unnatural_instructions_data(examples, extract_reformulations=False):
out = {
'input': [],
'output': [],
}
for example_instances in examples['instances']:
for instance in example_instances:
out['input'].append(instance['instruction_with_input'])
out['output'].append(instance['output'])
if extract_reformulations:
for example_reformulations in examples['reformulations']:
if example_reformulations is not None:
for instance in example_reformulations:
out['input'].append(instance['instruction_with_input'])
out['output'].append(instance['output'])
return out
ALPACA_PROMPT_DICT = {
"prompt_input": (
"Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
),
"prompt_no_input": (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response: "
),
}
def extract_alpaca_dataset(example):
if example.get("input", "") != "":
prompt_format = ALPACA_PROMPT_DICT["prompt_input"]
else:
prompt_format = ALPACA_PROMPT_DICT["prompt_no_input"]
return {'input': prompt_format.format(**example)}
def local_dataset(dataset_name):
if dataset_name.endswith('.json') or dataset_name.endswith('.jsonl'):
full_dataset = Dataset.from_json(path_or_paths=dataset_name)
elif dataset_name.endswith('.csv'):
full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
elif dataset_name.endswith('.tsv'):
full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
else:
raise ValueError(f"Unsupported dataset format: {dataset_name}")
split_dataset = full_dataset.train_test_split(test_size=0.1)
return split_dataset
def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
"""
Make dataset and collator for supervised fine-tuning.
Datasets are expected to have the following columns: { `input`, `output` }
Available datasets to be selected with `dataset` argument:
- alpaca, 52002 examples
- alpaca cleaned, 51942 examples
- chip2 (OIG), 210289 examples
- self-instruct, 82612 examples
- hh-rlhf (Anthropic), 160800 examples
- longform, 23.7k examples
- oasst1 (OpenAssistant) primary message tree only, 9,846 examples
Coming soon:
- unnatural instructions core, 66010 examples
- unnatural instructions full, 240670 examples
- alpaca-gpt4, 52002 examples
- unnatural-instructions-gpt4, 9000 examples
- supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
- flan (FLAN v2), up to 20M examples available
- vicuna
"""
def load_data(dataset_name):
if dataset_name == 'alpaca':
return load_dataset("tatsu-lab/alpaca")
elif dataset_name == 'alpaca-clean':
return load_dataset("yahma/alpaca-cleaned")
elif dataset_name == 'chip2':
return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
elif dataset_name == 'hh-rlhf':
return load_dataset("Anthropic/hh-rlhf")
elif dataset_name == 'longform':
return load_dataset("akoksal/LongForm")
elif dataset_name == 'oasst1':
return load_dataset("timdettmers/openassistant-guanaco")
elif dataset_name == "OpenAssistant/oasst_top1_2023-08-25":
return load_dataset("OpenAssistant/oasst_top1_2023-08-25")
elif dataset_name == 'vicuna':
raise NotImplementedError("Vicuna data was not released.")
else:
if os.path.exists(dataset_name):
try:
args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
full_dataset = local_dataset(dataset_name)
return full_dataset
except:
raise ValueError(f"Error loading dataset from {dataset_name}")
else:
raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")
def format_dataset(dataset, dataset_format):
if (
dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or
(dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean'])
):
dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
dataset = dataset.map(lambda x: {
'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
'output': x['text'].split('\n<bot>: ')[1],
})
elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
for old, new in [["prompt", "input"], ["completion", "output"]]:
dataset = dataset.rename_column(old, new)
elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
dataset = dataset.map(lambda x: {
'input': '',
'output': x['chosen']
})
elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
dataset = dataset.map(lambda x: {
'input': '',
'output': x['text'],
})
elif dataset_format == 'input-output':
# leave as is
pass
# Remove unused columns.
dataset = dataset.remove_columns(
[col for col in dataset.column_names['train'] if col not in ['input', 'output']]
)
return dataset
# Load dataset.
dataset = load_data(args.dataset)
dataset = format_dataset(dataset, args.dataset_format)
# Split train/eval, reduce size
if args.do_eval or args.do_predict:
if 'eval' in dataset:
eval_dataset = dataset['eval']
else:
print('Splitting train dataset in train and validation according to `eval_dataset_size`')
dataset = dataset["train"].train_test_split(
test_size=args.eval_dataset_size, shuffle=True, seed=42
)
eval_dataset = dataset['test']
if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
eval_dataset = eval_dataset.select(range(args.max_eval_samples))
if args.group_by_length:
eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
if args.do_train:
train_dataset = dataset['train']
if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
train_dataset = train_dataset.select(range(args.max_train_samples))
if args.group_by_length:
train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
data_collator = DataCollatorForCausalLM(
tokenizer=tokenizer,
source_max_len=args.source_max_len,
target_max_len=args.target_max_len,
train_on_source=args.train_on_source,
predict_with_generate=args.predict_with_generate,
)
return dict(
train_dataset=train_dataset if args.do_train else None,
eval_dataset=eval_dataset if args.do_eval else None,
predict_dataset=eval_dataset if args.do_predict else None,
data_collator=data_collator
)
def get_last_checkpoint(checkpoint_dir):
if isdir(checkpoint_dir):
is_completed = exists(join(checkpoint_dir, 'completed'))
if is_completed: return None, True # already finished
max_step = 0
for filename in os.listdir(checkpoint_dir):
if isdir(join(checkpoint_dir, filename)) and filename.startswith('checkpoint'):
max_step = max(max_step, int(filename.replace('checkpoint-', '')))
if max_step == 0: return None, is_completed # training started, but no checkpoint
checkpoint_dir = join(checkpoint_dir, f'checkpoint-{max_step}')
print(f"Found a previous checkpoint at: {checkpoint_dir}")
return checkpoint_dir, is_completed # checkpoint found!
return None, False # first training
def train():
hfparser = transformers.HfArgumentParser((
ModelArguments, DataArguments, TrainingArguments, GenerationArguments
))
model_args, data_args, training_args, generation_args, extra_args = \
hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
args = argparse.Namespace(
**vars(model_args), **vars(data_args), **vars(training_args)
)
print(args)
checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
if completed_training:
print('Detected that training was already completed!')
model, tokenizer = get_accelerate_model(args, checkpoint_dir)
model.config.use_cache = False
print('loaded model')
set_seed(args.seed)
data_module = make_data_module(tokenizer=tokenizer, args=args)
trainer = Seq2SeqTrainer(
model=model,
tokenizer=tokenizer,
args=training_args,
**{k:v for k,v in data_module.items() if k != 'predict_dataset'},
)
# Verifying the datatypes and parameter counts before training.
print_trainable_parameters(args, model)
dtypes = {}
for _, p in model.named_parameters():
dtype = p.dtype
if dtype not in dtypes: dtypes[dtype] = 0
dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items(): total+= v
for k, v in dtypes.items():
print(k, v, v/total)
all_metrics = {"run_name": args.run_name}
# Training
if args.do_train:
logger.info("*** Train ***")
# Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
# Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
all_metrics.update(metrics)
# Evaluation
if args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate(metric_key_prefix="eval")
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
all_metrics.update(metrics)
# Prediction
if args.do_predict:
logger.info("*** Predict ***")
prediction_output = trainer.predict(test_dataset=data_module['predict_dataset'], metric_key_prefix="predict")
prediction_metrics = prediction_output.metrics
predictions = prediction_output.predictions
predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
predictions = tokenizer.batch_decode(
predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
for i, example in enumerate(data_module['predict_dataset']):
example['prediction_with_input'] = predictions[i].strip()
example['prediction'] = predictions[i].replace(example['input'], '').strip()
fout.write(json.dumps(example) + '\n')
print(prediction_metrics)
trainer.log_metrics("predict", prediction_metrics)
trainer.save_metrics("predict", prediction_metrics)
all_metrics.update(prediction_metrics)
if (args.do_train or args.do_eval or args.do_predict):
with open(os.path.join(args.output_dir, "metrics.json"), "w") as fout:
fout.write(json.dumps(all_metrics))
if __name__ == "__main__":
train()
from transformers import AutoTokenizer
import transformers
import torch
import time
# model = "PY007/TinyLlama-1.1B-Chat-v0.1"
# model = "PY007/TinyLlama-1.1B-intermediate-step-240k-503b"
model = "output/503B_FT_lr1e-5_ep5/checkpoint-2920"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",
model=model,
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = (
f"### Human: {prompt} ### Assistant:"
)
start_time = time.time()
sequences = pipeline(
formatted_prompt,
do_sample=True,
top_k=50,
top_p = 0.9,
num_return_sequences=1,
repetition_penalty=1.1,
max_new_tokens=1024,
)
print("infer time:", time.time() - start_time, "s")
for seq in sequences:
print(f"Result: {seq['generated_text']}")
# We include a simple full-parameter finetuning & inference script here. Our V0.1 chat model is finetuned using this script.
# The FT dataset we use is openassistant-guanaco. For finetuning with less than 4GB RAM, we refer you to the QLoRA and bitsandbytes repos.
# We have not done extensive hyperparameter tuning, nor have we chosen a more performant FT dataset.
# We hope the community will explore finetuning TinyLlama and come up with better chat models. I will include community-finetuned models in this repo.
# V0.1
# CUDA_VISIBLE_DEVICES=0 accelerate launch --main_process_port 1234 sft/finetune.py \
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 sft/finetune.py \
--model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-240k-503b \
--output_dir ./output/503B_FT_lr1e-5_ep5 \
--logging_steps 10 \
--save_strategy epoch \
--data_seed 42 \
--save_total_limit 6 \
--evaluation_strategy epoch \
--eval_dataset_size 512 \
--max_eval_samples 1000 \
--per_device_eval_batch_size 1 \
--max_new_tokens 32 \
--dataloader_num_workers 3 \
--group_by_length=False \
--logging_strategy steps \
--remove_unused_columns False \
--do_train \
--do_eval \
--warmup_ratio 0.05 \
--lr_scheduler_type constant \
--dataset oasst1 \
--source_max_len 16 \
--target_max_len 512 \
--per_device_train_batch_size 4 \
--max_steps 0 \
--num_train_epochs 5 \
--learning_rate 1e-5 \
--adam_beta2 0.999 \
--max_grad_norm 1.0 \
--weight_decay 0.0 \
--seed 0 \
--trust_remote_code \
--report_to wandb
# # V0.2
# CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 sft/finetune.py \
# --model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-480k-1T \
# --output_dir ./output/503B_FT_lr1e-5_ep5_top1_2023-08-25 \
# --logging_steps 10 \
# --save_strategy epoch \
# --data_seed 42 \
# --save_total_limit 6 \
# --evaluation_strategy epoch \
# --eval_dataset_size 512 \
# --max_eval_samples 1000 \
# --per_device_eval_batch_size 1 \
# --max_new_tokens 32 \
# --dataloader_num_workers 3 \
# --group_by_length=False \
# --logging_strategy steps \
# --remove_unused_columns False \
# --do_train \
# --do_eval \
# --warmup_ratio 0.05 \
# --lr_scheduler_type constant \
# --dataset OpenAssistant/oasst_top1_2023-08-25 \
# --dataset_format oasst1 \
# --source_max_len 16 \
# --target_max_len 512 \
# --per_device_train_batch_size 4 \
# --max_steps 0 \
# --num_train_epochs 5 \
# --learning_rate 1e-5 \
# --adam_beta2 0.999 \
# --max_grad_norm 1.0 \
# --weight_decay 0.0 \
# --seed 0 \
# --trust_remote_code \
# --report_to wandb
from transformers import AutoTokenizer
import transformers
import torch
model = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",
model=model,
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = (
f"### Human: {prompt} ### Assistant:"
)
sequences = pipeline(
formatted_prompt,
do_sample=True,
top_k=50,
top_p = 0.9,
num_return_sequences=1,
repetition_penalty=1.1,
max_new_tokens=1024,
)
for seq in sequences:
print(f"Result: {seq['generated_text']}")
from transformers import AutoTokenizer
import transformers
import torch
model = "PY007/TinyLlama-1.1B-Chat-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
"text-generation",
model=model,
torch_dtype=torch.float16,
device_map="auto",
)
prompt = "How to get in a good university?"
formatted_prompt = (
f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
)
sequences = pipeline(
formatted_prompt,
do_sample=True,
top_k=50,
top_p = 0.9,
num_return_sequences=1,
repetition_penalty=1.1,
max_new_tokens=1024,
)
for seq in sequences:
print(f"Result: {seq['generated_text']}")
## Speculative Decoding
### HuggingFace "Assisted Generation"
| Large Model | Native Decoding | Assisted Decoding |
| ----------- | --------------- | ------------------ |
| guanaco-7b | 69 seconds | 38 seconds |
| guanaco-13b | 84 seconds | 45 seconds |
| guanaco-33b | 109 seconds | 62 seconds |
We use PY007/TinyLlama-1.1B-Chat-v0.1 as the assistant model and vary the large model from guanaco-7b to guanaco-33b. Experiments are run on a single A40 GPU with the code in instruct_hf_assisted_decoding.py. TinyLlama is loaded in fp16 and the large models are loaded in 8-bit, both to make guanaco-33b fit in memory and to keep the setup consistent. The prompt used is "Give me detailed info about Jeo Biden." and max_new_tokens is set to 512.
You can read this [article](https://huggingface.co/blog/assisted-generation) for more information about HuggingFace's Assisted Generation.
Quote from HF: "due to INT8 quantization and the use of causal masking in assisted generation, the output of greedy decoding may differ in rare occasions."
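As a minimal sketch, assisted generation only needs the draft model passed via `assistant_model` (adapted from instruct_hf_assisted_decoding.py below; the guanaco adapter loading is omitted and the base model name here is illustrative):
```
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
inputs = tokenizer("### Human: Give me detailed info about Jeo Biden.### Assistant:", return_tensors="pt").to(device)

# large target model in 8-bit, TinyLlama draft model in fp16
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", load_in_8bit=True)
assistant = AutoModelForCausalLM.from_pretrained("PY007/TinyLlama-1.1B-Chat-v0.1").half().to(device)

# passing `assistant_model` enables assisted (speculative) generation
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=512)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```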
#### TODO
- [ ] Thoroughly benchmark the average speedup on 52K Alpaca prompts.
### Llama.cpp Speculative Decoding
We have continue-pretrained a code TinyLlama from the 500B-token checkpoint on an additional 7B tokens of Python data [here](https://huggingface.co/PY007/TinyLlama-1.1B-python-v0.1).
The code for continue-pretraining can be found in pretrain/tinyllama_code.py.
```
./speculative \
-m models/CodeLlama-7b-hf/ggml-model-f16.gguf \
-md models/TinyLlama-1.1B-500B-python/ggml-model-q4_0.gguf \
-p "# Quick-sort implementation in Python and sample usage:" \
-e -ngl 1 -t 4 -n 256 -s 20 --temp 0 --draft 8
```
This gives:
```
encoded 12 tokens in 0.247 seconds, speed: 48.638 t/s
decoded 265 tokens in 7.909 seconds, speed: 33.507 t/s
n_draft = 16
n_predict = 265
n_drafted = 317
n_accept = 195
accept = 61.514%
draft:
llama_print_timings: load time = 53.14 ms
llama_print_timings: sample time = 652.62 ms / 1 runs ( 652.62 ms per token, 1.53 tokens per second)
llama_print_timings: prompt eval time = 73.81 ms / 12 tokens ( 6.15 ms per token, 162.58 tokens per second)
llama_print_timings: eval time = 2247.77 ms / 378 runs ( 5.95 ms per token, 168.17 tokens per second)
llama_print_timings: total time = 8154.92 ms
target:
llama_print_timings: load time = 534.47 ms
llama_print_timings: sample time = 208.12 ms / 265 runs ( 0.79 ms per token, 1273.32 tokens per second)
llama_print_timings: prompt eval time = 4210.38 ms / 382 tokens ( 11.02 ms per token, 90.73 tokens per second)
llama_print_timings: eval time = 682.80 ms / 16 runs ( 42.68 ms per token, 23.43 tokens per second)
llama_print_timings: total time = 8214.11 ms
ggml_metal_free: deallocating
ggml_metal_free: deallocating
```
Even though the model is continue-pretrained exclusively on Python, it retains its ability in other languages, such as C:
```
./speculative \
-m models/CodeLlama-7b-hf/ggml-model-f16.gguf \
-md models/TinyLlama-1.1B-500B-python/ggml-model-q4_0.gguf \
-p "// Quick-sort implementation in C (4 spaces indentation + detailed comments) and sample usage:\n\n#include" \
-e -ngl 1 -t 4 -n 256 -s 20 --temp 0 --draft 8
```
This gives:
```
encoded 25 tokens in 0.278 seconds, speed: 89.900 t/s
decoded 258 tokens in 6.432 seconds, speed: 40.112 t/s
n_draft = 28
n_predict = 258
n_drafted = 278
n_accept = 200
accept = 71.942%
draft:
llama_print_timings: load time = 932.54 ms
llama_print_timings: sample time = 583.50 ms / 1 runs ( 583.50 ms per token, 1.71 tokens per second)
llama_print_timings: prompt eval time = 81.50 ms / 25 tokens ( 3.26 ms per token, 306.73 tokens per second)
llama_print_timings: eval time = 1834.67 ms / 329 runs ( 5.58 ms per token, 179.32 tokens per second)
llama_print_timings: total time = 6710.30 ms
target:
llama_print_timings: load time = 18568.44 ms
llama_print_timings: sample time = 208.78 ms / 258 runs ( 0.81 ms per token, 1235.75 tokens per second)
llama_print_timings: prompt eval time = 3164.84 ms / 342 tokens ( 9.25 ms per token, 108.06 tokens per second)
llama_print_timings: eval time = 775.43 ms / 18 runs ( 43.08 ms per token, 23.21 tokens per second)
llama_print_timings: total time = 7650.67 ms
ggml_metal_free: deallocating
ggml_metal_free: deallocating
```
I have not tried 13B CodeLlama as the large model yet because my Mac does not have enough memory :).
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
model_id = "huggyllama/llama-13b"
peft_model_id = "timdettmers/guanaco-13b"
assistant_checkpoint = "PY007/TinyLlama-1.1B-Chat-v0.1"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = f"### Human: {prompt}### Assistant:"
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
model.load_adapter(peft_model_id)
print("Large model loaded")
model.config.use_cache = True
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint).half().to(device)
assistant_model.config.use_cache = True
print("Small model loaded")
print("###Native Decoding Starts...\n")
start = time.time()
outputs = model.generate(**inputs, assistant_model=None, max_new_tokens=512)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print("Time: ", end - start)
print("###TinyLlama Assisted Decoding Starts...\n")
start = time.time()
outputs = model.generate(**inputs, assistant_model=assistant_model,max_new_tokens=512)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
# print time in seconds
print("Time: ", end - start)