from awq import AutoAWQForCausalLM from transformers import AutoTokenizer, TextStreamer quant_path = "TheBloke/Mistral-7B-OpenOrca-AWQ" # Load model model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True, safetensors=True) tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) # Convert prompt to tokens prompt_template = """\ <|im_start|>system You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!<|im_end|> <|im_start|>user {prompt}<|im_end|> <|im_start|>assistant""" prompt = "You're standing on the surface of the Earth. "\ "You walk one mile south, one mile west and one mile north. "\ "You end up exactly where you started. Where are you?" tokens = tokenizer( prompt_template.format(prompt=prompt), return_tensors='pt' ).input_ids.cuda() # Generate output generation_output = model.generate( tokens, streamer=streamer, max_new_tokens=512 )