"""Run a single text-generation request against a fine-tuned checkpoint.

Loads the checkpoint named by ``model`` into a Hugging Face
``text-generation`` pipeline, wraps one prompt in the
``### Human: ... ### Assistant:`` template, samples one completion, and
prints the elapsed inference time followed by the generated text.
"""
import time

import torch
import transformers
from transformers import AutoTokenizer

# Alternative checkpoints that can be swapped in for ``model``:
# model = "PY007/TinyLlama-1.1B-Chat-v0.1"
# model = "PY007/TinyLlama-1.1B-intermediate-step-240k-503b"
model = "output/503B_FT_lr1e-5_ep5/checkpoint-2920"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    # Reuse the tokenizer loaded above instead of letting the pipeline
    # load a second copy from the same checkpoint.
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# NOTE(review): "Jeo" looks like a typo for "Joe"; left unchanged because
# editing the prompt changes what the model generates -- confirm intent.
prompt = "Give me detailed info about Jeo Biden."
formatted_prompt = (
    f"### Human: {prompt} ### Assistant:"
)

# perf_counter() is monotonic, so the measured interval is not affected by
# system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1,
    repetition_penalty=1.1,
    max_new_tokens=1024,
)
# Only the generation call is timed; model/tokenizer loading is excluded.
print("infer time:", time.perf_counter() - start_time, "s")

for seq in sequences:
    print(f"Result: {seq['generated_text']}")