int8_inference_huggingface.py
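# 8-bit (LLM.int8()) Llama-2 inference with Hugging Face transformers.
# Requires the accelerate and bitsandbytes packages in addition to transformers.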
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

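# Generation budget and checkpoint name (the meta-llama checkpoints are gated;
# accept the license on the Hub and log in with huggingface-cli first).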
MAX_NEW_TOKENS = 128
model_name = 'meta-llama/Llama-2-7b-hf'

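# Tokenize the prompt into input IDs (a [1, seq_len] tensor).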
text = 'Hamburg is in which country?\n'
tokenizer = LlamaTokenizer.from_pretrained(model_name)
input_ids = tokenizer(text, return_tensors="pt").input_ids

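# Leave ~2 GiB of headroom per GPU for activations and CUDA overhead;
# torch.cuda.mem_get_info() returns (free, total) memory in bytes.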
max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

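# Apply the same per-GPU memory cap to every visible device.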
n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}

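# Load the checkpoint quantized to int8 via bitsandbytes; device_map='auto' lets
# accelerate shard the layers across GPUs within the max_memory caps. (Recent
# transformers releases prefer quantization_config=BitsAndBytesConfig(load_in_8bit=True)
# over the bare load_in_8bit flag.)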
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    load_in_8bit=True,
    max_memory=max_memory,
)

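# input_ids can stay on CPU: accelerate's dispatch hooks move tensors to the
# correct device during the forward pass.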
generated_ids = model.generate(input_ids, max_new_tokens=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))