eval_utils.py
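"""Evaluation utilities: WikiText-2 perplexity, LibriSpeech word error rate, and MMLU."""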
import torch
import torch.nn as nn
from tqdm import tqdm
from lm_eval import evaluator
from datasets import load_dataset
from transformers import pipeline
from evaluate import load as load_metric
from lm_eval.tasks import initialize_tasks
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer


def get_device():
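    """Return the best available torch device: 'mps', 'cuda:0', or 'cpu'."""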
    if torch.backends.mps.is_available():
        return "mps"
    elif torch.cuda.is_available():
        return "cuda:0"
    else:
        return "cpu"


def evaluate_perplexity(model, tokenizer):
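    """Compute perplexity of a causal LM on the WikiText-2 test split.

    The corpus is tokenized into one long sequence, split into non-overlapping
    2048-token windows, and scored with the standard next-token loss, so that
    perplexity = exp(total_nll / (n_samples * seqlen)).
    """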
    def _perplexity(nlls, n_samples, seqlen):
        return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))

    # load and prepare dataset
    data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    data = tokenizer("\n\n".join(data["text"]), return_tensors="pt")
    data = data.input_ids.to(model.device)

    seqlen = 2048
    model = model.eval()
    n_samples = data.numel() // seqlen

    nlls = []

    with tqdm(range(n_samples), desc="Perplexity -") as progress_bar:
        for i in progress_bar:
            start_index = i * seqlen
            end_index = (i + 1) * seqlen
            batch = data[:, start_index:end_index].to(model.device)
            with torch.no_grad():
                logits = model(batch).logits
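            # drop the final logit and the first label so position t predicts token t+1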
            shift_logits = logits[:, :-1, :].contiguous().float()
            shift_labels = data[:, start_index:end_index][:, 1:]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
            neg_log_likelihood = loss.float() * seqlen
            nlls.append(neg_log_likelihood)

            curr_ppl = _perplexity(nlls, i + 1, seqlen)
            progress_bar.set_description(f"Perplexity {curr_ppl:.3f}")

    ppl = _perplexity(nlls, n_samples, seqlen)

    return ppl.item()


def eval_librispeech(model_id, num_samples=100, batch_size=4):
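    """Measure word error rate (WER) of an ASR model on LibriSpeech test-clean.

    Streams num_samples utterances, transcribes them in batches with a
    transformers pipeline, normalizes hypotheses and references, and reports a
    running WER in the progress bar.
    """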
    try:
        import jiwer, librosa, soundfile  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "Missing dependencies. Please run: pip install jiwer librosa soundfile"
        ) from e

    dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True)

    # Load the Whisper model pipeline for automatic speech recognition
    pipe = pipeline(
        task="automatic-speech-recognition",
        model=model_id,
        batch_size=batch_size,
        device=get_device(),
        torch_dtype=torch.float16,
    )

    # Word normalizer
    normalizer = BasicTextNormalizer()

    # Load the WER metric
    wer_metric = load_metric("wer")

    texts = []
    audio = []
    for i, data in tqdm(enumerate(dataset), total=num_samples, desc="Loading dataset"):
        if len(audio) == num_samples:
            break
        audio.append(data["audio"])
        texts.append(data["text"])

    references = []
    predictions = []

    with tqdm(range(0, num_samples, batch_size), desc="Word Error Rate: -") as pbar:
        for i in pbar:
            batch_audio = audio[i : i + batch_size]
            batch_texts = texts[i : i + batch_size]

            # inference
            results = pipe(batch_audio, batch_size=len(batch_audio))

            # normalize text
            normalized_predictions = [normalizer(result["text"]) for result in results]
            normalized_texts = [normalizer(text) for text in batch_texts]

            predictions.extend(normalized_predictions)
            references.extend(normalized_texts)

            # word error rate computation
            wer = (
                wer_metric.compute(predictions=predictions, references=references) * 100
            )
            pbar.set_description(f"Word Error Rate: {wer:.3f}%")


def eval_mmlu(
    model_path="gpt2",
    num_fewshot=1,
    batch_size=1,
    device="cuda:0",
    task_use_pretrained=False,
):
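    """Run the MMLU benchmark through lm-eval.

    Uses the vLLM backend when vllm is installed (loading the model with AWQ
    quantization unless task_use_pretrained is set); otherwise falls back to
    the Hugging Face backend.
    """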
    try:
        import vllm

        VLLM_INSTALLED = True
    except ImportError:
        VLLM_INSTALLED = False

    initialize_tasks(verbosity="DEBUG")

    if VLLM_INSTALLED:
        model = "vllm"
        model_args = dict(
            pretrained=model_path,
            max_model_len=2048,
            dtype="float16",
            trust_remote_code=True,
        )

        if not task_use_pretrained:
            model_args["quantization"] = "awq"
    else:
        model = "hf"
        model_args = dict(
            pretrained=model_path,
            device_map_option=device,
            dtype="float16",
            trust_remote_code=True,
        )
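    # lm-eval expects model_args as a comma-separated "key=value" string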
    model_args = ",".join([f"{k}={v}" for k, v in model_args.items()])

    results = evaluator.simple_evaluate(
        model=model,
        model_args=model_args,
        tasks=["mmlu"],
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        log_samples=False,
    )

    print(evaluator.make_table(results))


if __name__ == "__main__":
    ### PERPLEXITY
    # model_path = "mistralai/Mistral-7B-Instruct-v0.1"
    # model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    # tokenizer = AutoTokenizer.from_pretrained(model_path)
    # evaluate_perplexity(model, tokenizer)

    ### WORD ERROR RATE
    # model_id = "distil-whisper/distil-small.en"  # WER: 3.594%
    model_id = "distil-whisper/distil-medium.en"  # 3.436
    eval_librispeech(model_id)
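
    ### MMLU
    # Commented sketch: eval_mmlu expects a model path (typically an
    # AWQ-quantized checkpoint when vllm is installed); the checkpoint below
    # is only an illustrative example.
    # eval_mmlu(model_path="TheBloke/Mistral-7B-Instruct-v0.1-AWQ", num_fewshot=5, batch_size=8)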