samplers.py 2.96 KB
Newer Older
1
2


3
class Sampler:
    """Draw few-shot examples for a task and render them as a context string.

    Subclasses change the selection strategy by overriding `sample`.
    """

    def __init__(self, docs, task, fewshot_indices=None, rnd=None):
        """Set up the sampler.

        Args:
            docs: pool of few-shot documents (HF dataset split, provided by
                `task._fewshot_docs()`).
            task: task object; its `_config`, `doc_to_text` and
                `doc_to_target` are used by this class.
            fewshot_indices: optional indices; when truthy, the pool is
                narrowed with `docs.select(fewshot_indices)`.
            rnd: random generator exposing `sample(population, k)`. Required.

        Raises:
            ValueError: if `rnd` is not provided.
        """
        # Explicit raise instead of `assert`, so the check survives `python -O`.
        if not rnd:
            raise ValueError("must pass rnd to Sampler!")
        self.rnd = rnd

        self.task = task
        self.config = task._config

        self.delimiter = self.config.delimiter

        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
        if fewshot_indices:  # restrict the few-shot pool to the given indices
            self.docs = self.docs.select(fewshot_indices)

    def get_context(self, doc, num_fewshot):
        """Build the few-shot context string for `doc`.

        Only the few-shot examples are returned; appending the evaluated
        document itself is the caller's job.

        Args:
            doc: the document being evaluated; it is excluded from the examples.
            num_fewshot: number of few-shot examples to include.

        Returns:
            The rendered examples joined by the configured delimiter and
            followed by one trailing delimiter, or an empty string when no
            examples were selected.
        """
        # draw an extra few-shot sample if using the same split we are evaluating on
        same_split = self.config.fewshot_split == self.config.test_split
        n_samples = num_fewshot + 1 if same_split else num_fewshot

        # draw `n_samples` docs from the few-shot docs
        fewshotex = self.sample(n_samples)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        # Without examples there is nothing to separate: returning a lone
        # delimiter would put a stray separator at the start of the prompt.
        if not selected_docs:
            return ""

        rendered = [
            self.task.doc_to_text(example) + self.task.doc_to_target(example)
            for example in selected_docs
        ]

        # only returns the fewshot context! Does not append the document, do this outside the object
        return self.delimiter.join(rendered) + self.delimiter

    def sample(self, n):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
        """
        return self.rnd.sample(self.docs, n)


class BalancedSampler(Sampler):
    """Placeholder for a sampler intended to draw class-balanced few-shot examples."""

    def sample(self, n):
        """
        Not implemented yet: nothing is drawn and None is returned.

        TODO: this should return approximately class-balanced samples from our fewshot examples.
        TODO: what order should they be in? maybe random?
        """
        return None

class ManualSampler(Sampler):
    """Placeholder sampler; its selection strategy has not been written yet."""

    def sample(self, n):
        """
        Not implemented yet: nothing is drawn and None is returned.
        """
        return None


# TODO: how should we design this? It might be better to have a single sampler and pass more kwargs at init.
# It depends on which is easier for a new user to extend with their own functionality.

# types of sampler:
# - class-balanced, randomly shuffled
# - class-balanced, one particular set of fewshot examples for all evaled instances
# - hand-specify number of fewshot examples per class?
# - random, varies per example (check that this is curr. default in old repo)
# - random, unified per example
# - enforce a specific fixed fewshot string! (or should we not use this, in favor of including it in prompt template directly)


# - user-specified doc indices to restrict fewshot doc options to
# - user specifies split to use for drawing fewshot instances (TODO: manually prevent this from being same split you eval!)
# - user specifies a prepended "description"/string to add in front of the (prompted) input

# - user specifies a location to draw fewshot samples from? DO THIS IN TASK CLASS