samplers.py 4.82 KB
Newer Older
haileyschoelkopf's avatar
haileyschoelkopf committed
1
class ContextSampler:
    """
    Draw few-shot examples for a task and render them into a context string.

    Subclasses change *which* examples are drawn by overriding `sample`;
    the rendering done by `get_context` is shared.
    """

    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
        """
        :param docs: pool of few-shot documents. Must support `.select(indices)`
            when `fewshot_indices` is given.
        :param task: object exposing `_config`, `doc_to_text`, `doc_to_target`
            and `doc_to_choice`.
        :param fewshot_indices: optional indices; when truthy, the pool is
            narrowed to `docs.select(fewshot_indices)`.
        :param rnd: random generator exposing `.sample(population, k)`. Required.
        :raises ValueError: if `rnd` is missing (or falsy).
        """
        self.rnd = rnd
        # Explicit raise rather than `assert`: asserts are stripped under `python -O`.
        if not self.rnd:
            raise ValueError("must pass rnd to FewShotSampler!")

        self.task = task
        self.config = task._config

        self.target_delimiter = self.config.target_delimiter
        self.fewshot_delimiter = self.config.fewshot_delimiter

        self.doc_to_text = self.task.doc_to_text
        self.doc_to_target = self.task.doc_to_target
        self.doc_to_choice = self.task.doc_to_choice

        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
        if fewshot_indices:  # subset few-shot docs from
            self.docs = self.docs.select(fewshot_indices)

    def get_context(self, doc, num_fewshot):
        """
        Build the few-shot context for `doc`.

        :param doc: the document being evaluated; it is filtered out of the
            drawn examples if it happens to be among them.
        :param num_fewshot: maximum number of examples to include.
        :return: the rendered examples joined by `fewshot_delimiter`, followed
            by one trailing `fewshot_delimiter` (so with no examples the result
            is just the delimiter).
        """
        # draw an extra fewshot sample if using same split as evaluating on
        same_split = self.config.fewshot_split == self.config.test_split
        n_samples = num_fewshot + 1 if same_split else num_fewshot

        # draw `n_samples` docs from fewshot_docs
        fewshotex = self.sample(n_samples)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        labeled_examples = (
            self.fewshot_delimiter.join(
                [self._format_example(shot) for shot in selected_docs]
            )
            + self.fewshot_delimiter
        )

        return labeled_examples

    def _format_example(self, doc):
        """Render one few-shot doc as `<text><target_delimiter><target>`."""
        # Compute each field once; the task callables may be non-trivial.
        text = self.doc_to_text(doc)
        target = self.doc_to_target(doc)
        has_choices = self.config.doc_to_choice is not None

        # A non-string text is treated as an index into the doc's choices.
        if has_choices and not isinstance(text, str):
            question = self.doc_to_choice(doc)[text]
        else:
            question = text

        if isinstance(target, list):
            # Several targets: use the first one.
            answer = str(target[0])
        elif not has_choices or isinstance(target, str):
            answer = target
        else:
            # A non-string target is treated as an index into the doc's choices.
            answer = str(self.doc_to_choice(doc)[target])

        # TODO: is separating doc_to_text and doc_to_target by one space always desired?
        return question + self.target_delimiter + answer

    def sample(self, n):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
        """

        return self.rnd.sample(self.docs, n)


haileyschoelkopf's avatar
haileyschoelkopf committed
74
75
76
77
78
79
80
81
82
83
84
85
86
class FirstNSampler(ContextSampler):
    """Sampler that returns the leading few-shot docs in their stored order."""

    def sample(self, n):
        """
        Draw the first `n` samples in order from the specified split.
        Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.

        :param n: number of examples requested.
        :return: `self.docs[:n]`.
        :raises ValueError: if `n` exceeds the number of available few-shot docs.
        """
        available = len(self.docs)
        # Explicit raise rather than `assert`: under `python -O` an assert is
        # stripped and the slice below would silently return fewer docs.
        if n > available:
            raise ValueError(
                f"Error: number of fewshot samples requested exceeds the {available} that are available."
            )
        return self.docs[:n]


class BalancedSampler(ContextSampler):
    """Placeholder for a class-balanced few-shot sampler (not implemented yet)."""

    def sample(self, n):
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
        TODO: what order should they be in? maybe random?

        :raises NotImplementedError: always, until the sampling strategy is written.
        """
        # Fail loudly here: silently returning None would only surface later
        # as an unrelated-looking error when the result is iterated.
        raise NotImplementedError(
            "BalancedSampler.sample is not implemented yet."
        )


haileyschoelkopf's avatar
haileyschoelkopf committed
96
class ManualSampler(ContextSampler):
    """Placeholder for a manually specified few-shot sampler (not implemented yet)."""

    def sample(self, n):
        """
        Not implemented yet.

        :raises NotImplementedError: always, until the sampling strategy is written.
        """
        # Fail loudly here: silently returning None would only surface later
        # as an unrelated-looking error when the result is iterated.
        raise NotImplementedError(
            "ManualSampler.sample is not implemented yet."
        )
100
101


haileyschoelkopf's avatar
haileyschoelkopf committed
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Maps a sampler name to the sampler class that `get_sampler` returns for it.
SAMPLER_REGISTRY = {
    "default": ContextSampler,
    "first_n": FirstNSampler,
}


def get_sampler(name):
    """
    Look up a sampler class by its registered name.

    :param name: key into `SAMPLER_REGISTRY`.
    :return: the registered sampler class (not an instance).
    :raises ValueError: if no sampler is registered under `name`.
    """
    try:
        return SAMPLER_REGISTRY[name]
    except KeyError as err:
        raise ValueError(
            f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported sampler names: {', '.join(SAMPLER_REGISTRY)}"
        ) from err


lintangsutawika's avatar
lintangsutawika committed
117
# TODO: how should we design this? It might be better to have a single sampler and pass more kwargs at init.
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# It depends on which is easier for a new user to extend with their own functionality.

# types of sampler:
# - class-balanced, randomly shuffled
# - class-balanced, one particular set of fewshot examples for all evaled instances
# - hand-specify number of fewshot examples per class?
# - random, varies per example (check that this is currently the default in the old repo)
# - random, unified per example
# - enforce a specific fixed fewshot string! (or should we not use this, in favor of including it in prompt template directly)


# - user-specified doc indices to restrict fewshot doc options to
# - user specifies split to use for drawing fewshot instances (TODO: manually prevent this from being same split you eval!)
# - user specifies a prepended "description"/string to add in front of the (prompted) input

# - user specifies a location to draw fewshot samples from? DO THIS IN TASK CLASS