# long.py (426 bytes as originally committed by Nicolas Patry)
import datasets
import json


# Build a prompt file from the "ccdv/govreport-summarization" dataset: one
# single-turn "Summarize this report" conversation per report in the test
# split, written to long.json.
dataset = datasets.load_dataset("ccdv/govreport-summarization")
# NOTE(review): not referenced anywhere in the visible script; kept in case
# another consumer reads this module-level name -- confirm before removing.
max_new_tokens = 50


# Each entry is {"conversations": [<one human message>]}, where the message
# wraps the raw report text in a triple-backtick fence.
conversations = [
    {
        "conversations": [
            {
                "from": "human",
                "value": f"Summarize this report: ```{item['report']}```",
            }
        ]
    }
    for item in dataset["test"]
]

# Pin the encoding so the output does not depend on the platform locale.
with open("long.json", "w", encoding="utf-8") as f:
    json.dump(conversations, f, indent=4)