"applications/Chat/coati/vscode:/vscode.git/clone" did not exist on "7b9b86441fbffdd07021f234ec88d0dbc470fa5c"
PairedFilesReader.py 1.03 KB
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
import gzip

Rayyyyy's avatar
Rayyyyy committed
3
4
from . import InputExample

Rayyyyy's avatar
Rayyyyy committed
5
6

class PairedFilesReader(object):
    """Read a paired dataset that is split across several line-aligned files.

    Line ``i`` of every file belongs to the same example, so the files are
    read in lockstep. Paths ending in ``.gz`` are opened with ``gzip`` in
    text mode; every other path is opened as plain text. Both are decoded
    as UTF-8.
    """

    def __init__(self, filepaths):
        """Remember the files to read.

        Args:
            filepaths: Iterable of path strings, one per aligned file.
        """
        self.filepaths = filepaths

    def get_examples(self, max_examples=0):
        """Build one ``InputExample`` per line that is present in every file.

        Reading stops as soon as any file is exhausted, so surplus lines in
        longer files are ignored. Each text is stored exactly as returned by
        ``readline()``, i.e. including its trailing newline if it has one.

        Args:
            max_examples: Maximum number of examples to return. Values
                ``<= 0`` mean "no limit".

        Returns:
            A list of ``InputExample`` objects whose ``guid`` is the running
            index (as a string), whose ``texts`` holds one line per file (in
            ``filepaths`` order) and whose ``label`` is ``1``. The list is
            empty when no file paths were given.
        """
        handles = []
        try:
            # Opening happens inside the ``try`` so that handles opened
            # before a failing ``open`` are still closed.
            for filepath in self.filepaths:
                if filepath.endswith(".gz"):
                    handle = gzip.open(filepath, "rt", encoding="utf-8")
                else:
                    handle = open(filepath, encoding="utf-8")
                handles.append(handle)

            examples = []

            # Without any file there is nothing that could signal EOF, so
            # the read loop below would never terminate.
            if not handles:
                return examples

            while True:
                texts = []
                for handle in handles:
                    text = handle.readline()
                    # readline() returns "" only at EOF; a blank line is "\n".
                    if text == "":
                        return examples
                    texts.append(text)

                examples.append(
                    InputExample(guid=str(len(examples)), texts=texts, label=1)
                )
                if max_examples > 0 and len(examples) >= max_examples:
                    return examples
        finally:
            for handle in handles:
                handle.close()