test_metrics.py 6.31 KB
Newer Older
1
2
3
import math
from unittest.mock import MagicMock

4
5
6
import pytest
import torch

7
8
9
10
11
12
from vllm.spec_decode.metrics import AsyncMetricsCollector


def test_initial_call_returns_none():
    """Expect first call to get metrics to return None.
    """
13
14
15
16
17
18
19
20
21
22
    spec_decode_sampler = MagicMock()
    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
                                                           dtype=torch.long,
                                                           device='cuda')
    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
                                                          dtype=torch.long,
                                                          device='cuda')
    spec_decode_sampler.num_draft_tokens = 0

    collector = AsyncMetricsCollector(spec_decode_sampler)
23
24
25
26
27
28
29
30
    collector.init_gpu_tensors(rank=0)
    maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5)
    assert maybe_metrics is None


def test_second_call_returns_metrics():
    """Expect second call to not return None.
    """
31
32
33
34
35
36
37
38
    spec_decode_sampler = MagicMock()
    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
                                                           dtype=torch.long,
                                                           device='cuda')
    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
                                                          dtype=torch.long,
                                                          device='cuda')
    spec_decode_sampler.num_draft_tokens = 0
39
40
41
42
43
44
45

    collect_interval_s = 5.0
    timer = MagicMock()
    timer.side_effect = [
        0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
    ]

46
    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
47
48
49
50
51
52
53
54
55
56
57
58
                                      timer=timer,
                                      collect_interval_s=collect_interval_s)
    collector.init_gpu_tensors(rank=0)
    _ = collector.maybe_collect_rejsample_metrics(k=5)
    metrics = collector.maybe_collect_rejsample_metrics(k=5)
    assert metrics is not None


@pytest.mark.parametrize("rank", [1, 2, 3, 4])
def test_nonzero_rank_noop(rank):
    """Verify nonzero ranks don't collect metrics.
    """
59
60
61
62
63
64
65
66
67
68
    spec_decode_sampler = MagicMock()
    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
                                                           dtype=torch.long,
                                                           device='cuda')
    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
                                                          dtype=torch.long,
                                                          device='cuda')
    spec_decode_sampler.num_draft_tokens = 0

    collector = AsyncMetricsCollector(spec_decode_sampler)
69
70
71
72
73
74
75
76
77
    collector.init_gpu_tensors(rank=rank)
    _ = collector.maybe_collect_rejsample_metrics(k=5)
    metrics = collector.maybe_collect_rejsample_metrics(k=5)
    assert metrics is None


def test_noop_until_time():
    """Verify metrics aren't collected until enough time passes.
    """
78
79
80
81
82
83
84
85
    spec_decode_sampler = MagicMock()
    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
                                                           dtype=torch.long,
                                                           device='cuda')
    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
                                                          dtype=torch.long,
                                                          device='cuda')
    spec_decode_sampler.num_draft_tokens = 0
86
87
88
89
90
91
92
93

    collect_interval_s = 5.0
    timer = MagicMock()
    timer.side_effect = [
        0.0, collect_interval_s - 0.1, collect_interval_s - 0.1,
        collect_interval_s + 0.1, collect_interval_s + 0.1
    ]

94
    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
                                      timer=timer,
                                      collect_interval_s=collect_interval_s)
    collector.init_gpu_tensors(rank=0)

    _ = collector.maybe_collect_rejsample_metrics(k=5)
    metrics = collector.maybe_collect_rejsample_metrics(k=5)
    assert metrics is None

    _ = collector.maybe_collect_rejsample_metrics(k=5)
    metrics = collector.maybe_collect_rejsample_metrics(k=5)
    assert metrics is not None


@pytest.mark.parametrize("has_data", [True, False])
def test_initial_metrics_has_correct_values(has_data: bool):
    """Test correctness of metrics data.
    """
    if has_data:
        num_accepted_tokens = 103
        num_emitted_tokens = 104
        num_draft_tokens = 105
    else:
        num_accepted_tokens = 0
        num_emitted_tokens = 0
        num_draft_tokens = 0
    k = 5

122
    max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens(
123
124
        num_draft_tokens, k)

125
126
127
128
129
130
131
132
    spec_decode_sampler = MagicMock()
    spec_decode_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens,
                                                           dtype=torch.long,
                                                           device='cuda')
    spec_decode_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens,
                                                          dtype=torch.long,
                                                          device='cuda')
    spec_decode_sampler.num_draft_tokens = num_draft_tokens
133
134
135
136
137
138
139

    collect_interval_s = 5.0
    timer = MagicMock()
    timer.side_effect = [
        0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
    ]

140
    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
141
142
143
144
145
146
147
148
149
150
151
152
                                      timer=timer,
                                      collect_interval_s=collect_interval_s)
    collector.init_gpu_tensors(rank=0)
    _ = collector.maybe_collect_rejsample_metrics(k)
    metrics = collector.maybe_collect_rejsample_metrics(k)

    assert metrics.num_spec_tokens == k
    assert metrics.accepted_tokens == num_accepted_tokens
    assert metrics.draft_tokens == num_draft_tokens
    assert metrics.emitted_tokens == num_emitted_tokens

    if has_data:
153
154
155
        assert (metrics.draft_acceptance_rate == num_accepted_tokens /
                num_draft_tokens)
        assert (metrics.system_efficiency == num_emitted_tokens /
156
                max_num_emitted_tokens)
157
158
159
    else:
        assert math.isnan(metrics.draft_acceptance_rate)
        assert math.isnan(metrics.system_efficiency)