extract_prompts.py 1.13 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python3
import argparse


def extract_prompt(line: str) -> str | None:
    # Extract the content between the first '|' and the second '|'
    i = line.find("|")
    if i == -1:
        return None
    j = line.find("|", i + 1)
    if j == -1:
        return None
    return line[i + 1 : j].strip()


def main() -> None:
    """Command-line entry point: copy the first K prompts of a file to another file.

    Parses ``--input/-i``, ``--output/-o`` and ``--topk/-k`` from the command
    line, scans the input file line by line, and collects the non-empty
    results of ``extract_prompt`` until ``topk`` of them have been gathered.
    The collected prompts are then written to the output file, one per line.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", "-i", required=True, help="Input .lst file path")
    cli.add_argument("--output", "-o", required=True, help="Output file path")
    cli.add_argument(
        "--topk",
        "-k",
        type=int,
        default=100,
        help="Extract the top K prompts (default: 100)",
    )
    opts = cli.parse_args()
    limit = opts.topk

    collected: list[str] = []
    # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
    with open(opts.input, encoding="utf-8", errors="ignore") as src:
        for raw in src:
            if len(collected) >= limit:
                break
            prompt = extract_prompt(raw.rstrip("\n"))
            if not prompt:
                # Skip lines without two '|' separators and empty fields.
                continue
            collected.append(prompt)

    # The output is opened only after reading has finished.
    with open(opts.output, "w", encoding="utf-8") as dst:
        dst.writelines(f"{prompt}\n" for prompt in collected)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()