import os
import re
import time

import pytest
import torch
from einops import rearrange
from flash_attn.models.gpt import GPTLMHeadModel
from flash_attn.utils.generation import update_graph_cache
from transformers import GPT2Config


def get_logits(model, input_ids, max_length, teacher_outputs=None, **kwargs):
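    # Greedy-decode up to max_length. teacher_outputs, when provided, supplies the
    # token at each step (teacher forcing), so the eager and CUDA-graph runs see
    # identical inputs and their logits can be compared exactly.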
    out = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        fused_ft_kernel=True,
        teacher_outputs=teacher_outputs,
        return_dict_in_generate=True,
        output_scores=True,
        timing=True,
        **kwargs,
    )
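    # out.scores is one (batch_size, vocab_size) tensor per decoding step;
    # stack them into (batch_size, num_steps, vocab_size).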
    return torch.stack(out.scores, dim=1)


@pytest.mark.parametrize("seqlen,maxlen", [(10, 20), (30, 150), (3000, 3400), (14000, 15000)])
# @pytest.mark.parametrize('seqlen,maxlen', [(10, 20)])
@pytest.mark.parametrize("rotary", [None, "interleaved", "block"])
# @pytest.mark.parametrize('rotary', [None])
@pytest.mark.parametrize("model_name", ["gpt2"])
def test_greedy_decode_gpt2_cg(model_name, rotary, seqlen, maxlen):
    """Check that decoding with CUDA graph is the same as decoding without CUDA graph."""
    dtype = torch.float16
    device = "cuda"
    rtol, atol = 3e-3, 3e-1
    config = GPT2Config.from_pretrained(model_name)
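    # Extend the context window so the longest parametrization (maxlen=15000) fits.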
    config.n_positions = 16 * 1024
    assert seqlen <= maxlen <= config.n_positions
    if rotary is not None:
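        # Drop learned positional embeddings in favor of rotary embeddings.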
        config.n_positions = 0
        config.rotary_emb_dim = 32
        config.rotary_emb_interleaved = rotary == "interleaved"
    config.residual_in_fp32 = True
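    # Exercise FlashAttention and the fused kernels used by the CUDA-graph decode path.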
    config.use_flash_attn = True
    config.fused_bias_fc = True
    config.fused_mlp = True
    config.fused_dropout_add_ln = True

    model = GPTLMHeadModel(config, device=device, dtype=dtype)
    model.eval()

    torch.manual_seed(0)
    batch_size = 1
    input_ids = torch.randint(
        0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device
    )
    teacher_outputs = torch.randint(
        0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device
    )

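    # Logits from CUDA-graph decoding (cg=True) must match eager decoding exactly.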
    logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs)
    logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True)
    assert torch.equal(logits, logits_cg)

    # Try increasing batch size and maxlen, then decrease them to see if it's still correct
    batch_size = 3
    maxlen += 30
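    # Larger batch size and max length exceed the shapes captured above, so the
    # graph cache has to re-capture with bigger buffers.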
    input_ids = torch.randint(
        0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device
    )
    teacher_outputs = torch.randint(
        0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device
    )
    logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs)
    logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True)
    assert torch.equal(logits, logits_cg)

    batch_size = 2
    maxlen -= 35
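    # Shrinking the shapes again must also decode correctly with the existing cache.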
    input_ids = torch.randint(
        0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device
    )
    teacher_outputs = torch.randint(
        0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device
    )
    logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs)
    logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True)
    assert torch.equal(logits, logits_cg)