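"""build_dataset.py: build a question-answering dataset from the Llama 2 paper.

The paper text (llama2.txt) is split into segments of roughly 1100 tokens, and
the first 30 segments are written to questions.jsonl together with a fixed set
of questions and reference answers.
"""
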
import json

import transformers

# Load the raw text of the Llama 2 paper.
with open("llama2.txt", "r", encoding="utf-8", errors="ignore") as fin:
    content = fin.read()

# Count tokens in the full document with the Llama 2 tokenizer.
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
print(f"num tokens: {len(t.encode(content))}")
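# Note: the meta-llama checkpoints are gated on Hugging Face; this assumes
# access has been granted and credentials are configured locally.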

# Split the document into paragraphs on blank lines.
SEP = "\n\n"
parts = content.split(SEP)
print(f"num parts: {len(parts)}")

# Greedily pack consecutive paragraphs into a segment until its token count
# exceeds `segment_len`; trailing paragraphs that never reach the threshold
# are dropped.
segment_len = 1100

segments = []
tmp = []
tmp_len = 0
for part in parts:
    tmp.append(part)
    tmp_len += len(t.encode(part))

    if tmp_len > segment_len:
        segments.append(SEP.join(tmp))
        tmp = []
        tmp_len = 0

# Report the token count of each segment.
for i, s in enumerate(segments):
    print(i, len(t.encode(s)))

# Dump the first 30 segments with the fixed questions and answers as a single
# JSONL record.
with open("questions.jsonl", "w") as fout:
    fout.write(
        json.dumps(
            {
                "documents": segments[:30],
                "questions": [
                    "What is the name of the fine-tuned LLMs?",
                    "Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
                    "What is the number of parameters in the largest Llama 2 model?",
                    "What is the batch size of fine-tuning?",
                    "Where can we find the details of potential data contamination?",
                    "What is the full name of MPT?",
                    "What is the power consumption of RSC in Watt?",
                    "How many tokens of data do they train on?",
                    "Which model's release is delayed due to a lack of time to sufficiently red team?",
                    "Which activation function is used in Llama?",
                ],
                "answers": [
                    "Llama 2 Chat",
                    "1",
                    "70 B",
                    "64",
                    "A 6",
                    "MosaicML",
                    "400",
                    "2 trillion",
                    "34 B",
                    "SwiGLU",
                ],
            }
        )
        + "\n"
    )
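
# A minimal sketch of how a consumer might load the dataset back; this reader
# is an assumption for illustration, not part of the build step:
#
#   with open("questions.jsonl") as fin:
#       for line in fin:
#           record = json.loads(line)
#           assert len(record["questions"]) == len(record["answers"])
#           print(len(record["documents"]), "documents,",
#                 len(record["questions"]), "questions")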