"""Minimal batch-inference demo for a Yuan-style LLM served through vLLM.

Usage:
    python yuan_inference.py --model_path /path/to/model

Loads the tokenizer and model from ``--model_path``, runs one prompt with
greedy decoding, and reports wall-clock latency and generated-token count.
"""
import os
import time
import argparse

from transformers import LlamaTokenizer

from vllm import LLM, SamplingParams

## Command-line parameters.
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='', help='model path')
args = parser.parse_args()
model_path = args.model_path

# Yuan models use a Llama-style tokenizer with '<eod>' as the end-of-dialogue
# EOS marker; BOS/EOS are NOT auto-appended, so prompts are tokenized verbatim.
tokenizer = LlamaTokenizer.from_pretrained(model_path, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
# Register the model's extra special tokens (FIM / commit / jupyter markers)
# so they round-trip as single tokens instead of being split.
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)

prompts = ["写一篇春游作文"]
# top_k=1 makes decoding effectively greedy/deterministic; generation stops at
# the '<eod>' end-of-dialogue token or after max_tokens.
sampling_params = SamplingParams(max_tokens=300, temperature=1, top_p=0, top_k=1, min_p=0.0, length_penalty=1.0, repetition_penalty=1.0, stop="<eod>", )
## init model
# tensor_parallel_size=8 shards the model over 8 GPUs; max_num_seqs=1 keeps a
# single in-flight sequence so the timing below measures one request cleanly.
llm = LLM(model=model_path, trust_remote_code=True, tensor_parallel_size=8, gpu_memory_utilization=0.8, disable_custom_all_reduce=True, max_num_seqs=1)

## inference
start_time = time.time()
outputs = llm.generate(prompts, sampling_params)
end_time = time.time()

total_tokens = 0
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    # Count tokens straight from the completion's token_ids: exact and free,
    # instead of re-encoding the decoded text (which is slower and may not
    # round-trip to the same token count).
    num_tokens = len(output.outputs[0].token_ids)
    total_tokens += num_tokens
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

print("inference_time:", (end_time - start_time))
print("total_tokens:", total_tokens)