sweep_lm_data.py 1.65 KB
Newer Older
Myle Ott's avatar
Myle Ott committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env python3


# Maps a dataset shortname to ``(path template, number of shards)``.
# The template is expanded with ``str.format(shard_index)``; a shard count of
# 0 marks an unsharded dataset whose path is used as a single entry.
_SHORTNAME_TO_DATA = {
    # mmap datasets
    'CC-NEWS-en.v7.1': (
        '/private/home/myleott/data/data-bin/CC-NEWS-en.v7.1/shard{}', 10),
    'fb_posts': (
        '/data/tmp/fb_posts.en.2018-2019.bpe.mmap-bin/shard{}', 100),
    'fb_posts_gfs': (
        '/mnt/vol/gfsai-flash2-east/ai-group/users/myleott/fb_posts/fb_posts.en.2018-2019.bpe.mmap-bin/shard{}', 100),
    # old datasets
    'CC-NEWS-en.v6': (
        '/private/home/myleott/data/data-bin/CC-NEWS-en.v6', 0),
    'CC-NEWS-en.v9': (
        '/private/home/namangoyal/fairseq-py/data-bin/CC-NEWS-en.v9/shard{}', 100),
    'bookwiki': (
        '/private/home/myleott/data/data-bin/bookwiki.10shards/shard{}', 10),
    'bookwiki_full': (
        '/private/home/myleott/data/data-bin/bookwiki-bin', 0),
    'fb_posts_old': (
        '/data/tmp/mono.english.public.2018-2019.shard{}.sents.bpe-bin', 100),
    # NOTE(review): this entry used to be keyed 'fb_posts_gfs', duplicating
    # the mmap entry above, so it could never be selected. It is exposed here
    # as 'fb_posts_gfs_old' by analogy with 'fb_posts' / 'fb_posts_old' --
    # confirm the intended shortname.
    'fb_posts_gfs_old': (
        '/mnt/vol/gfsai-flash2-east/ai-group/users/myleott/fb_posts/en/mono.english.public.2018-2019.shard{}.sents.bpe-bin', 100),
    'wmt19_en_news_docs': (
        '/private/home/myleott/data/data-bin/wmt19_en_news_docs/wmt19_en_news_docs.bpe.shard{}', 100),
}


def _expand_shards(fmt, num_shards):
    """Return the data string for path template *fmt*.

    With ``num_shards == 0`` the template is formatted once with ``0``;
    otherwise it is formatted with every index in ``range(num_shards)`` and
    the results are joined with ``':'``.
    """
    if num_shards == 0:
        return fmt.format(0)
    return ':'.join(fmt.format(i) for i in range(num_shards))


def set_data_based_on_shortname(args):
    """Replace a dataset shortname in ``args.data`` with its shard path(s).

    If ``args.data`` is one of the known shortnames it is overwritten with
    the corresponding path, or with a ``':'``-separated list of shard paths
    for sharded datasets. Any other value is treated as an unsharded path
    template: it is passed through ``str.format(0)``, so a plain path is kept
    as is while a literal ``{}`` in it would be replaced by ``0``.

    Args:
        args: object with a string ``data`` attribute; modified in place.

    Returns:
        The same ``args`` object, with ``args.data`` updated.
    """
    # Unknown names fall back to (args.data, 0), i.e. "use the value itself
    # as a single unsharded path".
    fmt, num_shards = _SHORTNAME_TO_DATA.get(args.data, (args.data, 0))
    args.data = _expand_shards(fmt, num_shards)
    return args