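# test_latency.py (376 bytes) -- last committed by zhaoying1.
# The repository web viewer's navigation labels and 1-10 line-number gutter
# that preceded the code were page residue, not part of the script.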
from transformers import AutoModel, AutoTokenizer
import os


# Latency smoke test: load a ChatGLM checkpoint, move it to the GPU in half
# precision, and run one prompt through the model's `measure_latency` method.

# Single source of truth for the checkpoint location, shared by tokenizer and
# model. Presumably a local directory -- confirm it exists on the target host.
_MODEL_PATH = "/zhaoy/chatglm-6b-moel"

# trust_remote_code=True is passed to both loaders, as in the original script.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_PATH, trust_remote_code=True)

model = AutoModel.from_pretrained(_MODEL_PATH, trust_remote_code=True)
# Convert to fp16, move to CUDA, then switch to evaluation mode; eval()
# returns the module itself, so the calls can be chained.
model = model.half().cuda().eval()

# Single-turn prompt ("Hello") with an empty conversation history.
text = "你好"
# NOTE(review): `measure_latency` is not a standard transformers API; it is
# presumably defined by the checkpoint's remote code -- confirm its contract.
response, history = model.measure_latency(tokenizer, text, history=[])
print(response)