"""Minimal ChatGLM-6B inference smoke test.

Loads the tokenizer and model from a local checkpoint directory, runs a
single-turn chat with the prompt "你好", and prints the model's reply.
Requires a CUDA device (the model is moved to GPU in fp16).
"""
from transformers import AutoModel, AutoTokenizer
import os  # kept from original; unused in the visible code

# NOTE(review): path says "moel" — possible typo for "model"; confirm the
# directory name on disk before changing it.
MODEL_PATH = "/zhaoy/chatglm-6b-moel"


def main() -> None:
    """Load ChatGLM-6B and print the response to a single prompt."""
    # trust_remote_code is required: ChatGLM ships custom modeling code
    # inside the checkpoint directory.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    # .half() → fp16 to fit the 6B model in GPU memory; .eval() disables
    # dropout for deterministic inference.
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).half().cuda()
    model = model.eval()

    text = "你好"
    # Fix: the ChatGLM-6B API that returns (response, history) is
    # model.chat(...); the original called a nonexistent measure_latency(),
    # which would raise AttributeError at runtime.
    response, history = model.chat(tokenizer, text, history=[])
    print(response)


if __name__ == "__main__":
    main()