instruct_hf_assisted_decoding.py
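# Demo of Hugging Face assisted generation (speculative decoding): a small
# "draft" model proposes several tokens ahead, and the large target model
# verifies them in a single forward pass. With greedy decoding (the default
# here), the output matches what the target model would generate on its own,
# usually at lower wall-clock latency.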
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time


# Target model: Llama-13B with the Guanaco instruction-tuned LoRA adapter.
model_id = "huggyllama/llama-13b"
peft_model_id = "timdettmers/guanaco-13b"
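# Draft model for assisted decoding; it must share the target model's
# tokenizer/vocabulary, which TinyLlama (a scaled-down Llama) does.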
assistant_checkpoint = "PY007/TinyLlama-1.1B-Chat-v0.1"


device = "cuda" if torch.cuda.is_available() else "cpu"
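# One tokenizer serves both the target and draft models (shared vocabulary).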
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Guanaco-style prompt template.
prompt = "Give me detailed info about Joe Biden."
formatted_prompt = f"### Human: {prompt} ### Assistant:"
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
# Load the 13B target model in 8-bit (requires bitsandbytes) and attach the
# Guanaco LoRA adapter (requires peft).
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
model.load_adapter(peft_model_id)
print("Large model loaded")
model.config.use_cache = True

# Load the small draft model in fp16. The KV cache must be enabled on both
# models for assisted generation.
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint).half().to(device)
assistant_model.config.use_cache = True
print("Small model loaded")


print("###Native Decoding Starts...\n")
start = time.time()
outputs = model.generate(**inputs, assistant_model=None, max_new_tokens=512)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print("Time: ", end - start)

print("###TinyLlama Assisted Decoding Starts...\n")
start = time.time()
outputs = model.generate(**inputs, assistant_model=assistant_model,max_new_tokens=512)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
# print time in seconds
print("Time: ", end - start)