# build_dataset.py
# Build a small document dataset: fetch the Wikipedia article for each city,
# truncate it to roughly 10k tokens, and append it to questions.jsonl.
import json

import transformers
import wikipedia

# Tokenizer used to measure document length in tokens.
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
# Cities whose Wikipedia articles form the dataset.
# Fixed typo: "los angles" -> "los angeles" so wikipedia.page() resolves the
# intended article directly instead of depending on fuzzy title search.
city_names = ["los angeles", "london", "tokyo", "beijing", "singapore"]


# For each city: fetch its Wikipedia article, truncate it to approximately
# 10k tokens (estimated via proportional character length), log the actual
# token counts, and append the truncated text as one JSON line.
# (This rewrite also removes web-scraper residue lines that had been pasted
# into the middle of the loop and broke the file's syntax.)
for city_name in city_names:
    content = str(wikipedia.page(city_name).content)
    # Collapse double newlines so paragraph breaks cost fewer tokens.
    content = content.replace("\n\n", "\n")

    tokens = t.encode(content)

    # Approximate a 10k-token budget by scaling the character count by the
    # tokens-per-char ratio. Cap at 1.0 so articles already under budget are
    # kept whole (slicing past the end would be a no-op anyway; the cap just
    # makes the intent explicit).
    ratio = min(1.0, 10000 / len(tokens))
    truncate_len = int(ratio * len(content))
    truncate_content = content[:truncate_len]
    # Re-encode because the char-based truncation is only an estimate.
    truncate_tokens = t.encode(truncate_content)

    # Count token
    print(
        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
    )

    # Append mode: one JSON object per line, accumulated across runs.
    with open("questions.jsonl", "a") as fout:
        fout.write(json.dumps({"document": truncate_content}) + "\n")