Fix example int8_inference_huggingface.py (#414)

* Fix example int8_inference_huggingface.py
* Update examples/int8_inference_huggingface.py

  Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

Fix example int8_inference_huggingface.py (#414)
* Fix example int8_inference_huggingface.py
* Update examples/int8_inference_huggingface.py

  Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
4b232edf · Alejandro Rodríguez Salamanca · GitHub · cc5f8cd8 · 4b232edf
Unverified Commit 4b232edf authored Feb 27, 2024 by Alejandro Rodríguez Salamanca Committed by GitHub Feb 27, 2024
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 5 deletions

examples/int8_inference_huggingface.py examples/int8_inference_huggingface.py +5 -5

No files found.
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import LlamaForCausalLM, LlamaTokenizer

 MAX_NEW_TOKENS = 128
-model_name = 'decapoda-research/llama-7b-hf'
+model_name = 'meta-llama/Llama-2-7b-hf'

 text = 'Hamburg is in which country?\n'
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = LlamaTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(text, return_tensors="pt").input_ids

-free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
 max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

 n_gpus = torch.cuda.device_count()
 max_memory = {i: max_memory for i in range(n_gpus)}

-model = AutoModelForCausalLM.from_pretrained(
+model = LlamaForCausalLM.from_pretrained(
  model_name,
  device_map='auto',
  load_in_8bit=True,
  max_memory=max_memory
 )
+
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))