run_camembert.py 1.86 KB
Newer Older
Louis MARTIN's avatar
Louis MARTIN committed
1
2
import torch

3
from transformers.modeling_camembert import CamembertForMaskedLM
Aymeric Augustin's avatar
Aymeric Augustin committed
4
from transformers.tokenization_camembert import CamembertTokenizer
Louis MARTIN's avatar
Louis MARTIN committed
5
6
7
8


def fill_mask(masked_input, model, tokenizer, topk=5):
    """Return the ``topk`` most probable replacements for the mask in ``masked_input``.

    Adapted from
    https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Args:
        masked_input: Text containing exactly one ``<mask>`` placeholder.
        model: Callable such that ``model(input_ids)[0]`` is a score tensor
            indexed as ``[batch, position, vocabulary]``.
        tokenizer: Object providing ``encode``, ``convert_ids_to_tokens``,
            ``mask_token`` and ``mask_token_id``.
        topk: Number of candidates to return.

    Returns:
        A list of ``(filled_text, probability, predicted_token)`` tuples in the
        order produced by ``Tensor.topk`` (highest probability first).

    Raises:
        AssertionError: If ``masked_input`` does not contain exactly one ``<mask>``.
    """
    assert masked_input.count("<mask>") == 1, "masked_input must contain exactly one <mask>"

    # Batch of size 1.
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)

    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        # The first element of the model output holds the per-position vocabulary scores.
        logits = model(input_ids)[0]

    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    prob = logits[0, masked_index, :].softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)

    # Keep the tokens in a list: joining them into one string and splitting it
    # again would corrupt any token that itself contains a space.
    # U+2581 is the word-boundary marker in the token pieces; render it as a space.
    predicted_tokens = [
        tokenizer.convert_ids_to_tokens(token_id).replace("\u2581", " ")
        for token_id in indices.tolist()
    ]

    # A predicted token may already start with a space, so when the mask is
    # preceded by a space that space is replaced together with the mask.
    masked_token = tokenizer.mask_token
    spaced_mask = " " + masked_token
    target = spaced_mask if spaced_mask in masked_input else masked_token

    return [
        (masked_input.replace(target, predicted_token), probability, predicted_token)
        for predicted_token, probability in zip(predicted_tokens, values.tolist())
    ]


38
39
# Load the "camembert-base" tokenizer and masked-language model.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")
# Put the model in evaluation mode before running inference.
model.eval()

# Demo: print the three most probable completions for the masked word.
masked_input = "Le camembert est <mask> :)"
print(fill_mask(masked_input, model, tokenizer, topk=3))