"""Minimal inference example for a GPTQ-quantized causal language model.

Loads the tokenizer and the quantized model from ``quantized_model_dir``,
generates a continuation for a fixed prompt and prints the decoded text.
"""

import logging

# NOTE: TextGenerationPipeline is only referenced by the commented-out
# pipeline alternative at the end of main(); it is kept so that the
# alternative works as soon as it is uncommented.
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM

# Directory the tokenizer and the quantized model are loaded from.
quantized_model_dir = "./Qwen1.5-7B-4bit-gptq-4"


def main() -> None:
    """Load the quantized model and print one generated completion.

    The tokenizer and model are both read from ``quantized_model_dir``; the
    model is placed on ``cuda:0``. The prompt ``"auto_gptq is"`` is tokenized,
    moved to the model's device, passed to ``model.generate`` with its default
    generation settings, and the first returned sequence is decoded and
    printed to stdout.
    """
    tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")

    # inference with model.generate
    # The encoded prompt must live on the same device as the model weights.
    inputs = tokenizer("auto_gptq is", return_tensors="pt").to(model.device)
    generated = model.generate(**inputs)
    # generate() returns a batch; a single prompt was given, so take index 0.
    print(tokenizer.decode(generated[0]))

    # or you can also use pipeline
    # pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
    # print(pipeline("auto-gptq is")[0]["generated_text"])


if __name__ == "__main__":
    # Configure logging only when run as a script, so importing this module
    # does not alter the host application's logging setup.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    main()