build_dataset.py

import json

import transformers
import wikipedia

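# The tokenizer is used only to measure article length in tokens;
# loading the Llama-2 tokenizer may require Hugging Face authentication.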
model_path = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(model_path)
city_names = [
    "los angles",
    "london",
    "tokyo",
    "beijing",
    "singapore",
    "paris",
    "dubai",
    "sydney",
    "moscow",
    "rome",
    "toronto",
    "rio de janeiro",
    "istanbul",
    "berlin",
    "auckland",
    "buenos aires",
    "mexico city",
    "mumbai",
    "seoul",
    "bangkok",
    "cairo",
    "athens",
    "jerusalem",
]


def get_content(city_name):
    # Pull the full Wikipedia article text and collapse double newlines.
    content = wikipedia.page(city_name).content
    content = content.replace("\n\n", "\n")

    tokens = t.encode(content)

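    # Approximate a 3000-token budget by scaling the character length
    # by the target/actual token ratio, then re-tokenize to check the result.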
    expected_tokens = 3000
    truncate_len = int((expected_tokens / len(tokens)) * len(content))
    truncate_content = content[:truncate_len]
    truncate_tokens = t.encode(truncate_content)

    # Report the original and truncated token counts.
    print(
        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
    )

    return truncate_content


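# Fetch and truncate each city's article, writing one JSON object per line (JSONL).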
if __name__ == "__main__":
    with open("questions.jsonl", "w") as fout:
        for city_name in city_names:
            truncate_content = get_content(city_name)
            fout.write(json.dumps({"document": truncate_content}) + "\n")