#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fileinput
import sacremoses
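# Detokenize whitespace-tokenized input (files or stdin) with the Moses
# detokenizer, strip '@'-style joiners, and remove extra spaces around '=' and '–'.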
def main():
parser = argparse.ArgumentParser(description="")
parser.add_argument("files", nargs="*", help="input files")
args = parser.parse_args()
detok = sacremoses.MosesDetokenizer()
for line in fileinput.input(args.files, openhook=fileinput.hook_compressed):
print(
detok.detokenize(line.strip().split(" "))
.replace(" @", "")
.replace("@ ", "")
.replace(" =", "=")
.replace("= ", "=")
.replace(" – ", "–")
)
if __name__ == "__main__":
main()
ar_AR
cs_CZ
de_DE
en_XX
es_XX
et_EE
fi_FI
fr_XX
gu_IN
hi_IN
it_IT
ja_XX
kk_KZ
ko_KR
lt_LT
lv_LV
my_MM
ne_NP
nl_XX
ro_RO
ru_RU
si_LK
tr_TR
vi_VN
zh_CN
af_ZA
az_AZ
bn_IN
fa_IR
he_IL
hr_HR
id_ID
ka_GE
km_KH
mk_MK
ml_IN
mn_MN
mr_IN
pl_PL
ps_AF
pt_XX
sv_SE
sw_KE
ta_IN
te_IN
th_TH
tl_XX
uk_UA
ur_PK
xh_ZA
gl_ES
sl_SI
# Multilingual Translation
[Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401)
## Introduction
This work is for training multilingual translation models with multiple bitext datasets. The framework supports (see the [training](#training) and [finetuning](#finetuning) sections for examples):
* temperature-based sampling over unbalanced datasets of different translation directions
  - --sampling-method with choices=['uniform', 'temperature', 'concat']
  - --sampling-temperature
* automatically adding source and/or target language tokens to source/target sentences, using data prepared in the same way as for bilingual training
  - --encoder-langtok with choices=['src', 'tgt', None] to specify whether to add source or target language tokens to the source sentences
  - --decoder-langtok (binary option) to specify whether to add target language tokens to the target sentences
* finetuning mBART pretrained models for multilingual translation
  - --finetune-from-model to specify the path from which to load the pretrained model
## Preprocessing data
Multilingual training requires a joint BPE vocabulary. Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/master/examples/mbart#bpe-data) to reuse our pretrained SentencePiece model.
You can also train a joint BPE model on your own dataset and then follow the steps in [this section](https://github.com/pytorch/fairseq/tree/master/examples/translation#multilingual-translation).
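For reference, a minimal sketch of applying the pretrained SentencePiece model to one direction; the flags match the `spm_encode.py` call used by the data scripts later in this commit, and the file names are placeholders:
```bash
SPM_ENCODE=<path to an spm_encode.py script, e.g. fairseq's scripts/spm_encode.py>
SPM_MODEL=<path to the downloaded sentence.bpe.model>
python $SPM_ENCODE \
    --model $SPM_MODEL \
    --output_format=piece \
    --inputs train.en_XX-cs_CZ.en_XX train.en_XX-cs_CZ.cs_CZ \
    --outputs train.bpe.en_XX-cs_CZ.en_XX train.bpe.en_XX-cs_CZ.cs_CZ
```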
## Training
```bash
lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
path_2_data=<set to data path>
lang_list=<a file which contains a list of languages separated by new lines>
fairseq-train $path_2_data \
--encoder-normalize-before --decoder-normalize-before \
--arch transformer --layernorm-embedding \
--task translation_multi_simple_epoch \
--sampling-method "temperature" \
--sampling-temperature 1.5 \
--encoder-langtok "src" \
--decoder-langtok \
--lang-dict "$lang_list" \
--lang-pairs "$lang_pairs" \
--criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
--lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 1024 --update-freq 2 \
--save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
--seed 222 --log-format simple --log-interval 2
```
## Finetuning
We can also finetune multilingual models from pretrained models, e.g. [mBART](https://github.com/pytorch/fairseq/tree/master/examples/mbart).
```bash
lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
path_2_data=<set to data path>
lang_list=<a file which contains a list of languages separated by new lines>
pretrained_model=<path to the pretrained model, e.g. mbart or another trained multilingual model>
fairseq-train $path_2_data \
--finetune-from-model $pretrained_model \
--encoder-normalize-before --decoder-normalize-before \
--arch transformer --layernorm-embedding \
--task translation_multi_simple_epoch \
--sampling-method "temperature" \
--sampling-temperature 1.5 \
--encoder-langtok "src" \
--decoder-langtok \
--lang-dict "$lang_list" \
--lang-pairs "$lang_pairs" \
--criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
--lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 1024 --update-freq 2 \
--save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
--seed 222 --log-format simple --log-interval 2
```
## Generate
The following command uses the multilingual task (translation_multi_simple_epoch) to generate translations from $source_lang to $target_lang on the test dataset. During generation, source language tokens are added to the source sentences and the target language token is used as the first token when decoding the target sentences. The options --lang-dict and --lang-pairs tell the generation process the ordered list of languages and the translation directions the trained model is aware of; they must be consistent with training.
```bash
model=<multilingual model>
source_lang=<source language>
target_lang=<target language>
fairseq-generate $path_2_data \
--path $model \
--task translation_multi_simple_epoch \
--gen-subset test \
--source-lang $source_lang \
--target-lang $target_lang
--sacrebleu --remove-bpe 'sentencepiece'\
--batch-size 32 \
--encoder-langtok "src" \
--decoder-langtok \
--lang-dict "$lang_list" \
--lang-pairs "$lang_pairs" > ${source_lang}_${target_lang}.txt
```
Fairseq will write the translations to the file ${source_lang}_${target_lang}.txt, with the sacreBLEU score at the end.
You can also use a customized tokenizer to compare performance with the literature. For example, you can get a tokenizer [here](https://github.com/rsennrich/wmt16-scripts) and do the following:
```bash
TOKENIZER=<path to a customized tokenizer for decoding evaluation>
TOK_CMD=<"$TOKENIZER $target_lang" or cat for sacrebleu>
cat ${source_lang}_${target_lang}.txt | grep -P "^H" |sort -V |cut -f 3- |$TOK_CMD > ${source_lang}_${target_lang}.hyp
cat ${source_lang}_${target_lang}.txt | grep -P "^T" |sort -V |cut -f 2- |$TOK_CMD > ${source_lang}_${target_lang}.ref
sacrebleu -tok 'none' -s 'none' ${source_lang}_${target_lang}.ref < ${source_lang}_${target_lang}.hyp
```
# mBART50 models
* [mBART 50 pretrained model](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.pretrained.tar.gz)
* [mBART 50 finetuned many-to-one](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.n1.tar.gz)
* [mBART 50 finetuned one-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.1n.tar.gz)
* [mBART 50 finetuned many-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.nn.tar.gz)
Please download and extract the tarballs above. Each tarball contains:
* The fairseq model checkpoint: model.pt
* The list of supported languages: ML50_langs.txt
* The SentencePiece model: sentence.bpe.model
* The fairseq dictionary for each language: dict.{lang}.txt (replace {lang} with a language listed in ML50_langs.txt)
To use the trained models,
* use the tool [binarize.py](./data_scripts/binarize.py) to binarize your data with sentence.bpe.model and dict.{lang}.txt, and copy the dictionaries to your data path (a sketch follows the generation command below)
* then run the generation command:
```bash
path_2_data=<path to your binarized data with fairseq dictionaries>
model=<path_to_extracted_folder>/model.pt
lang_list=<path_to_extracted_folder>/ML50_langs.txt
source_lang=<source language>
target_lang=<target language>
fairseq-generate $path_2_data \
--path $model \
--task translation_multi_simple_epoch \
--gen-subset test \
--source-lang $source_lang \
--target-lang $target_lang \
--sacrebleu --remove-bpe 'sentencepiece' \
--batch-size 32 \
--encoder-langtok "src" \
--decoder-langtok \
--lang-dict "$lang_list"
```
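As referenced above, here is a minimal sketch of the binarization step, assuming the binarization helper shipped with the data scripts in this commit is the `binarize.py` linked earlier (it reads `WORKDIR_ROOT` and `SPM_PATH` from the environment, fetches sentence.bpe.model and the joint dictionary if they are missing, and expects raw files named like `train.<src>-<tgt>.<lang>` under the raw folder):
```bash
export WORKDIR_ROOT=<a directory which will hold all working files>
export SPM_PATH=<path to an spm_encode.py script, e.g. fairseq's scripts/spm_encode.py>
# place your raw bitext under $WORKDIR_ROOT/ML50/raw first
python ./data_scripts/binarize.py --data_root ${WORKDIR_ROOT}/ML50
# the binarized data and the fairseq dictionaries end up under $WORKDIR_ROOT/ML50/databin
```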
## Citation
```bibtex
@article{tang2020multilingual,
title={Multilingual Translation with Extensible Multilingual Pretraining and Finetuning},
author={Yuqing Tang and Chau Tran and Xian Li and Peng-Jen Chen and Naman Goyal and Vishrav Chaudhary and Jiatao Gu and Angela Fan},
year={2020},
eprint={2008.00401},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
# Install dependencies
```bash
pip install -r requirement.txt
```
# Download the data set
```bash
export WORKDIR_ROOT=<a directory which will hold all working files>
```
The downloaded data will be placed at $WORKDIR_ROOT/ML50.
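A minimal sketch of the intended flow; `download_ML50_v1.sh` below is an assumed name for the aggregate download script included with these data scripts (the one that calls `download_wmt20.sh`, `download_wat19_my.sh`, `download_iwslt_and_extract.sh`, and the other per-corpus downloaders):
```bash
# with WORKDIR_ROOT exported as above
cd <path to these data scripts>
bash ./download_ML50_v1.sh   # assumed file name for the aggregate download script
```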
# Preprocess the data
Install SentencePiece (SPM) from [here](https://github.com/google/sentencepiece), then set the environment variables below.
```bash
export WORKDIR_ROOT=<a directory which will hold all working files>
export SPM_PATH=<a path pointing to sentencepiece spm_encode.py>
```
The preprocessing produces the following folders:
* $WORKDIR_ROOT/ML50/raw: extracted raw data
* $WORKDIR_ROOT/ML50/dedup: deduplicated data
* $WORKDIR_ROOT/ML50/clean: data with valid and test sentences removed from the deduplicated data
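A minimal sketch of the deduplication and overlap-check steps, using the helper scripts included in this commit; the script file names below are assumptions, so point them at wherever you saved the helpers:
```bash
mkdir -p $WORKDIR_ROOT/ML50/dedup
# deduplicate the raw training bitext (the helper with --from-folder/--to-folder shown later in this commit)
python ./dedup_all.py \
    --from-folder $WORKDIR_ROOT/ML50/raw \
    --to-folder $WORKDIR_ROOT/ML50/dedup
# report how many valid/test sentences also appear in the training data
python ./check_self_overlaps.py --folder $WORKDIR_ROOT/ML50/raw --split test
```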
import argparse
import glob
import itertools
import os, sys
import shutil
from subprocess import check_call, check_output
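# Binarize ML50 data for fairseq: download the mBART50 SentencePiece model and
# joint dictionary if they are missing, apply spm_encode to every raw
# {train,valid,test} bitext found under <data_root>/raw, and run
# fairseq-preprocess with a joined dictionary to produce the databin folder.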
def call_output(cmd):
print(f"Executing: {cmd}")
ret = check_output(cmd, shell=True)
print(ret)
return ret
def call(cmd):
print(cmd)
check_call(cmd, shell=True)
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
SPM_PATH = os.environ.get('SPM_PATH', None)
if SPM_PATH is None or not SPM_PATH.strip():
print("Please install sentence piecence from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. Exitting...")
sys.exit(-1)
SPM_MODEL = f'{WORKDIR_ROOT}/sentence.bpe.model'
SPM_VOCAB = f'{WORKDIR_ROOT}/dict_250k.txt'
SPM_ENCODE = f'{SPM_PATH}'
if not os.path.exists(SPM_MODEL):
call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model -O {SPM_MODEL}")
if not os.path.exists(SPM_VOCAB):
call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/dict_250k.txt -O {SPM_VOCAB}")
def get_data_size(raw):
cmd = f'wc -l {raw}'
ret = call_output(cmd)
return int(ret.split()[0])
def encode_spm(model, direction, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None):
src, tgt = direction.split('-')
for split in splits:
src_raw, tgt_raw = f'{RAW_DIR}/{split}{prefix}.{direction}.{src}', f'{RAW_DIR}/{split}{prefix}.{direction}.{tgt}'
if os.path.exists(src_raw) and os.path.exists(tgt_raw):
cmd = f"""python {SPM_ENCODE} \
--model {model}\
--output_format=piece \
--inputs {src_raw} {tgt_raw} \
--outputs {BPE_DIR}/{direction}{prefix}/{split}.bpe.{src} {BPE_DIR}/{direction}{prefix}/{split}.bpe.{tgt} """
print(cmd)
call(cmd)
def binarize_(
bpe_dir,
databin_dir,
direction, spm_vocab=SPM_VOCAB,
splits=['train', 'test', 'valid'],
):
src, tgt = direction.split('-')
try:
shutil.rmtree(f'{databin_dir}', ignore_errors=True)
os.mkdir(f'{databin_dir}')
except OSError as error:
print(error)
cmds = [
"fairseq-preprocess",
f"--source-lang {src} --target-lang {tgt}",
f"--destdir {databin_dir}/",
f"--workers 8",
]
if isinstance(spm_vocab, tuple):
src_vocab, tgt_vocab = spm_vocab
cmds.extend(
[
f"--srcdict {src_vocab}",
f"--tgtdict {tgt_vocab}",
]
)
else:
cmds.extend(
[
f"--joined-dictionary",
f"--srcdict {spm_vocab}",
]
)
input_options = []
if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"):
input_options.append(
f"--trainpref {bpe_dir}/train.bpe",
)
if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"):
input_options.append(f"--validpref {bpe_dir}/valid.bpe")
if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"):
input_options.append(f"--testpref {bpe_dir}/test.bpe")
if len(input_options) > 0:
cmd = " ".join(cmds + input_options)
print(cmd)
call(cmd)
def binarize(
databin_dir,
direction, spm_vocab=SPM_VOCAB, prefix='',
splits=['train', 'test', 'valid'],
pairs_per_shard=None,
):
def move_databin_files(from_folder, to_folder):
for bin_file in glob.glob(f"{from_folder}/*.bin") \
+ glob.glob(f"{from_folder}/*.idx") \
+ glob.glob(f"{from_folder}/dict*"):
try:
shutil.move(bin_file, to_folder)
except OSError as error:
print(error)
bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin"
bpe_dir = f"{BPE_DIR}/{direction}{prefix}"
if pairs_per_shard is None:
binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits)
move_databin_files(bpe_databin_dir, databin_dir)
else:
# binarize valid and test which will not be sharded
binarize_(
bpe_dir, bpe_databin_dir, direction,
spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"])
for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"):
path_strs = os.path.split(shard_bpe_dir)
shard_str = path_strs[-1]
shard_folder = f"{bpe_databin_dir}/{shard_str}"
databin_shard_folder = f"{databin_dir}/{shard_str}"
print(f'working from {shard_folder} to {databin_shard_folder}')
os.makedirs(databin_shard_folder, exist_ok=True)
binarize_(
shard_bpe_dir, shard_folder, direction,
spm_vocab=spm_vocab, splits=["train"])
for test_data in glob.glob(f"{bpe_databin_dir}/valid.*") + glob.glob(f"{bpe_databin_dir}/test.*"):
filename = os.path.split(test_data)[-1]
try:
os.symlink(test_data, f"{databin_shard_folder}/{filename}")
except OSError as error:
print(error)
move_databin_files(shard_folder, databin_shard_folder)
def load_langs(path):
with open(path) as fr:
langs = [l.strip() for l in fr]
return langs
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50")
parser.add_argument("--raw-folder", default='raw')
parser.add_argument("--bpe-folder", default='bpe')
parser.add_argument("--databin-folder", default='databin')
args = parser.parse_args()
DATA_PATH = args.data_root #'/private/home/yuqtang/public_data/ML50'
RAW_DIR = f'{DATA_PATH}/{args.raw_folder}'
BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}'
DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}'
os.makedirs(BPE_DIR, exist_ok=True)
raw_files = itertools.chain(
glob.glob(f'{RAW_DIR}/train*'),
glob.glob(f'{RAW_DIR}/valid*'),
glob.glob(f'{RAW_DIR}/test*'),
)
directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
for direction in directions:
prefix = ""
splits = ['train', 'valid', 'test']
try:
shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True)
os.mkdir(f'{BPE_DIR}/{direction}{prefix}')
os.makedirs(DATABIN_DIR, exist_ok=True)
except OSError as error:
print(error)
spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB
encode_spm(spm_model, direction=direction, splits=splits)
binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os, sys
import subprocess
import re
from subprocess import check_call, check_output
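# Sanity-check the extracted IWSLT test files: score each side of every test file
# against the corresponding official sacrebleu test set and report any file whose
# BLEU is not 100 (i.e. whose contents do not match the reference).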
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")
def run_eval_bleu(cmd):
output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip()
print(output)
bleu = -1.0
for line in output.strip().split('\n'):
m = BLEU_REGEX.search(line)
if m is not None:
bleu = m.groups()[0]
bleu = float(bleu)
break
return bleu
def check_data_test_bleu(raw_folder, data_lang_pairs):
not_matchings = []
for sacrebleu_set, src_tgts in data_lang_pairs:
for src_tgt in src_tgts:
print(f'checking test bleus for: {src_tgt} at {sacrebleu_set}')
src, tgt = src_tgt.split('-')
ssrc, stgt = src[:2], tgt[:2]
if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
# reversed direction may have different test set
test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
else:
test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
cmd1 = f'cat {test_src} | sacrebleu -t "{sacrebleu_set}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
cmd2 = f'cat {test_tgt} | sacrebleu -t "{sacrebleu_set}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
bleu1 = run_eval_bleu(cmd1)
if bleu1 != 100.0:
not_matchings.append(f'{sacrebleu_set}:{src_tgt} source side not matching: {test_src}')
bleu2 = run_eval_bleu(cmd2)
if bleu2 != 100.0:
not_matchings.append(f'{sacrebleu_set}:{src_tgt} target side not matching: {test_tgt}')
return not_matchings
if __name__ == "__main__":
to_data_path = f'{WORKDIR_ROOT}/iwsltv2'
not_matching = check_data_test_bleu(
f'{to_data_path}/raw',
[
('iwslt17', ['en_XX-ar_AR', 'en_XX-ko_KR', 'ar_AR-en_XX', 'ko_KR-en_XX']),
('iwslt17', ['en_XX-it_IT', 'en_XX-nl_XX', 'it_IT-en_XX', 'nl_XX-en_XX']),
('iwslt17/tst2015', ['en_XX-vi_VN', "vi_VN-en_XX"]),
]
)
if len(not_matching) > 0:
print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import glob
import argparse
from utils.dedup import deup
import sys
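# Report, for every translation direction in a data folder, how much the chosen
# valid/test split overlaps with the training data in the same folder: number of
# overlapping sentence pairs plus source-side and target-side overlaps.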
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
def get_directions(folder):
raw_files = glob.glob(f'{folder}/train*')
directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
return directions
def diff_list(lhs, rhs):
return set(lhs).difference(set(rhs))
def check_diff(
from_src_file, from_tgt_file,
to_src_file, to_tgt_file,
):
seen_in_from = set()
seen_src_in_from = set()
seen_tgt_in_from = set()
from_count = 0
with open(from_src_file, encoding='utf-8') as fsrc, \
open(from_tgt_file, encoding='utf-8') as ftgt:
for s, t in zip(fsrc, ftgt):
seen_in_from.add((s, t))
seen_src_in_from.add(s)
seen_tgt_in_from.add(t)
from_count += 1
common = 0
common_src = 0
common_tgt = 0
to_count = 0
seen = set()
with open(to_src_file, encoding='utf-8') as fsrc, \
open(to_tgt_file, encoding='utf-8') as ftgt:
for s, t in zip(fsrc, ftgt):
to_count += 1
if (s, t) not in seen:
if (s, t) in seen_in_from:
common += 1
if s in seen_src_in_from:
common_src += 1
seen_src_in_from.remove(s)
if t in seen_tgt_in_from:
common_tgt += 1
seen_tgt_in_from.remove(t)
seen.add((s, t))
return common, common_src, common_tgt, from_count, to_count
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--folder", type=str, required=True,
help="the data folder ")
parser.add_argument("--split", type=str, default='test',
help="split (valid, test) to check against training data")
parser.add_argument('--directions', type=str, default=None, required=False)
args = parser.parse_args()
if args.directions is None:
directions = set(get_directions(args.folder))
directions = sorted(directions)
else:
directions = args.directions.split(',')
directions = sorted(set(directions))
results = []
print(f'checking where {args.split} split data are in training')
print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')
for direction in directions:
src, tgt = direction.split('-')
from_src_file = f'{args.folder}/{args.split}.{src}-{tgt}.{src}'
from_tgt_file = f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}'
if not os.path.exists(from_src_file):
# some test/valid data might be in reverse directions:
from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}'
from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}'
to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}'
to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}'
if not os.path.exists(to_src_file) or not os.path.exists(from_src_file):
continue
r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file)
results.append(r)
print(f'{direction}\t', '\t'.join(map(str, r)))
if __name__ == "__main__":
main()
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import argparse
import pandas as pd
import sys
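# Check mined/raw training bitext against test sets: collect all test sentences
# for the given directions, then count, per direction, how many training source
# or target lines also occur in any of those test sets.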
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
def load_langs(path):
with open(path) as fr:
langs = [l.strip() for l in fr]
return langs
def load_sentences(raw_data, split, direction):
src, tgt = direction.split('-')
src_path = f"{raw_data}/{split}.{direction}.{src}"
tgt_path = f"{raw_data}/{split}.{direction}.{tgt}"
if os.path.exists(src_path) and os.path.exists(tgt_path):
return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())]
else:
return []
def swap_direction(d):
src, tgt = d.split('-')
return f'{tgt}-{src}'
def get_all_test_data(raw_data, directions, split='test'):
test_data = [
x
for dd in directions
for d in [dd, swap_direction(dd)]
for x in load_sentences(raw_data, split, d)
]
# all_test_data = {s for _, d in test_data for s in d}
all_test_data = {}
for lang, d in test_data:
for s in d:
s = s.strip()
lgs = all_test_data.get(s, set())
lgs.add(lang)
all_test_data[s] = lgs
return all_test_data, test_data
def check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train={}):
# src, tgt = direction.split('-')
print(f'check training data for {direction} in {src_path} and {tgt_path}')
size = 0
overlapped_size_counted_dup = 0
if not os.path.exists(tgt_path) or not os.path.exists(src_path):
return mess_up_train, size, overlapped_size_counted_dup
with open(src_path) as f, open(tgt_path) as g:
for src_line, tgt_line in zip(f, g):
s = src_line.strip()
t = tgt_line.strip()
size += 1
if s in all_test_data:
langs = mess_up_train.get(s, set())
langs.add(direction)
mess_up_train[s] = langs
overlapped_size_counted_dup += 1
if t in all_test_data:
langs = mess_up_train.get(t, set())
langs.add(direction)
mess_up_train[t] = langs
overlapped_size_counted_dup += 1
print(f'{direction}: size={size}, overlapped={overlapped_size_counted_dup}')
return mess_up_train, size, overlapped_size_counted_dup
def check_train_all(raw_data, directions, all_test_data):
mess_up_train = {}
data_sizes = {}
# raw_data = '~chau/data-bin/MineBART/multilingual_mined_100M/en_XX/et_EE-en_XX/all.{en_XX, et_EE}'
print(f'checking training data against # {len(all_test_data)} sentences')
print(f'example test data: ', [s for i, s in enumerate(all_test_data.keys()) if i < 10])
for direction in directions:
src, tgt = direction.split('-')
path = f'{raw_data}/en_XX/{direction}/all'
src_path = f'{path}.{src}'
tgt_path = f'{path}.{tgt}'
print(f'checking {src_path} {tgt_path}')
_, size, overlapped_size_counted_dup = check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train)
data_sizes[direction] = (size, overlapped_size_counted_dup)
return mess_up_train, data_sizes
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--folder", type=str, required=True,
help="the data folder ")
parser.add_argument("--test-data", type=str, required=True,
help="the test data folder ")
parser.add_argument('--directions', type=str, default=None, required=False)
args = parser.parse_args()
directions = args.directions.split(',')
directions = sorted(set(directions))
results = []
# print(f'checking where {args.split} split data are in training')
# print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')
raw_data = args.folder
all_test_data, test_data = get_all_test_data(args.test_data, directions, split='test')
mess_up_train, data_sizes = check_train_all(raw_data, directions, all_test_data)
print(data_sizes)
if __name__ == "__main__":
main()
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import glob
import argparse
from utils.dedup import deup
import sys
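# Deduplicate training bitext: for every direction found in --from-folder (or
# passed via --directions), run utils.dedup.deup on the train files and write
# the deduplicated pairs to --to-folder.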
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--from-folder", type=str, required=True,
help="the data folder to be dedup")
parser.add_argument("--to-folder", type=str, required=True,
help="the data folder to save deduped data")
parser.add_argument('--directions', type=str, default=None, required=False)
args = parser.parse_args()
if args.directions is None:
raw_files = glob.glob(f'{args.from_folder}/train*')
directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
else:
directions = args.directions.split(',')
directions = sorted(set(directions))
for direction in directions:
src, tgt = direction.split('-')
src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}'
tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}'
src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}'
tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}'
assert src_file != src_file_out
assert tgt_file != tgt_file_out
print(f'deduping {src_file}, {tgt_file}')
deup(src_file, tgt_file, src_file_out, tgt_file_out)
if __name__ == "__main__":
main()
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
# first run download_wmt20.sh; it will install a few useful tools for other scripts
# TODO: need to print out instructions on downloading a few files which require manual authentication from the websites
bash ./download_wmt20.sh
python ./download_wmt19_and_before.py
bash ./download_wat19_my.sh
python ./download_ted_and_extract.py
bash ./download_lotus.sh
bash ./download_iitb.sh
bash ./download_af_xh.sh
# IWSLT download URLs have changed in the meantime; TODO: fix them:
bash ./download_iwslt_and_extract.sh
# TODO: globalvoices URLs changed; need to be fixed
bash ./download_flores_data.sh
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# set -x -e
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
# put intermediate files
TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
# output {train,valid,test} files to dest
DEST=${WORKDIR_ROOT}/ML50/raw
ROOT=${WORKDIR_ROOT}
UTILS=$PWD/utils
TMX2CORPUS="${UTILS}/tmx2corpus"
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"
mkdir -p $TMP_DIR
mkdir -p $DEST
mkdir -p $UTILS
function download_opus(){
src=$1
tgt=$2
subset=$3
url=$4
mkdir extract_$subset.$src-$tgt
pushd extract_$subset.$src-$tgt
if [ ! -f "$subset.$src-$tgt.tmx.gz" ]; then
wget $url -O "$subset.$src-$tgt.tmx.gz"
gzip -d "$subset.$src-$tgt.tmx.gz"
f=$subset.$src-$tgt.tmx
$TMX_TOOL $f
mv bitext.$src ../$subset.$src-$tgt.$src
mv bitext.$tgt ../$subset.$src-$tgt.$tgt
fi
popd
}
function concat_subsets(){
src=$1
tgt=$2
subsets=$3
src_train=raw_train.$src-$tgt.$src
tgt_train=raw_train.$src-$tgt.$tgt
> $src_train
> $tgt_train
for subset in $subsets; do
cat $subset.$src-$tgt.$src >> $src_train
cat $subset.$src-$tgt.$tgt >> $tgt_train
done
}
function get_seeded_random()
{
seed="$1"
openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
</dev/zero 2>/dev/null
}
function split_train_valid(){
src=$1
tgt=$2
raw_src_train=raw_train.$src-$tgt.$src
raw_tgt_train=raw_train.$src-$tgt.$tgt
shuf --random-source=<(get_seeded_random 43) $raw_src_train > shuffled.$src-$tgt.$src
shuf --random-source=<(get_seeded_random 43) $raw_tgt_train > shuffled.$src-$tgt.$tgt
head -n 1500 shuffled.$src-$tgt.$src > valid.$src-$tgt.$src
head -n 1500 shuffled.$src-$tgt.$tgt > valid.$src-$tgt.$tgt
tail +1501 shuffled.$src-$tgt.$src > train.$src-$tgt.$src
tail +1501 shuffled.$src-$tgt.$tgt > train.$src-$tgt.$tgt
}
function copy2dst(){
lsrc=$1
ltgt=$2
src=${lsrc:0:2}
tgt=${ltgt:0:2}
cp valid.$src-$tgt.$src $DEST/valid.$lsrc-$ltgt.$lsrc
cp valid.$src-$tgt.$tgt $DEST/valid.$lsrc-$ltgt.$ltgt
cp train.$src-$tgt.$src $DEST/train.$lsrc-$ltgt.$lsrc
cp train.$src-$tgt.$tgt $DEST/train.$lsrc-$ltgt.$ltgt
}
#for xh-en
declare -A xh_en_urls
xh_en_urls=(
[Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
[wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
[memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
[uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
[GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
[XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
[KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
[Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)
mkdir $TMP_DIR/xh-en
pushd $TMP_DIR/xh-en
for k in "${!xh_en_urls[@]}"
do
name=$k
url=${xh_en_urls[$k]}
echo "$name: $url"
download_opus xh en $name $url
done
concat_subsets xh en "${!xh_en_urls[@]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd
##
#for af-en
declare -A af_en_urls
af_en_urls=(
[Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
[uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
[GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
[QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
[KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
[OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
[SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
[Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)
mkdir $TMP_DIR/af-en
pushd $TMP_DIR/af-en
for k in "${!af_en_urls[@]}"
do
name=$k
url=${af_en_urls[$k]}
echo "$name: $url"
download_opus af en $name $url
done
concat_subsets af en "${!af_en_urls[@]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
set -e
set -o pipefail
SRC=en
SI_TGT=si
NE_TGT=ne
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
ROOT=${WORKDIR_ROOT}/tmp
mkdir -p $ROOT
DATA=$ROOT/data
NE_ROOT=$DATA/all-clean-ne
SI_ROOT=$DATA/all-clean-si
mkdir -p $DATA $NE_ROOT $SI_ROOT
SI_OPUS_DATASETS=(
"$SI_ROOT/GNOME.en-si"
"$SI_ROOT/Ubuntu.en-si"
"$SI_ROOT/KDE4.en-si"
"$SI_ROOT/OpenSubtitles.en-si"
)
SI_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-si.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-si.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-si.txt.zip"
"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-si.txt.zip"
)
NE_OPUS_DATASETS=(
"$NE_ROOT/GNOME.en-ne"
"$NE_ROOT/Ubuntu.en-ne"
"$NE_ROOT/KDE4.en-ne"
)
NE_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip"
)
REMOVE_FILE_PATHS=()
# Download data
download_data() {
CORPORA=$1
URL=$2
if [ -f $CORPORA ]; then
echo "$CORPORA already exists, skipping download"
else
echo "Downloading $URL"
wget $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA
if [ -f $CORPORA ]; then
echo "$URL successfully downloaded."
else
echo "$URL not successfully downloaded."
rm -f $CORPORA
exit -1
fi
fi
}
# Example: download_opus_data $LANG_ROOT $TGT
download_opus_data() {
LANG_ROOT=$1
TGT=$2
if [ "$TGT" = "si" ]; then
URLS=("${SI_OPUS_URLS[@]}")
DATASETS=("${SI_OPUS_DATASETS[@]}")
else
URLS=("${NE_OPUS_URLS[@]}")
DATASETS=("${NE_OPUS_DATASETS[@]}")
fi
# Download and extract data
for ((i=0;i<${#URLS[@]};++i)); do
URL=${URLS[i]}
CORPORA=${DATASETS[i]}
download_data $CORPORA $URL
unzip -o $CORPORA -d $LANG_ROOT
REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE )
done
cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC
cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT
REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC )
REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT )
}
download_opus_data $SI_ROOT $SI_TGT
cp ${SI_OPUS_DATASETS[3]}.$SRC $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SRC
cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TGT
REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT )
download_opus_data $NE_ROOT $NE_TGT
# Download and extract Global Voices data
GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en"
GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz"
download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL
gunzip -Nf $GLOBAL_VOICES.gz
sed -ne 's?.*<source>\(.*\)</source>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT
sed -ne 's?.*<target[^>]*>\(.*\)</target>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC
REMOVE_FILE_PATHS+=( $GLOBAL_VOICES )
# Download and extract the bible dataset
BIBLE_TOOLS=bible-corpus-tools
XML_BIBLES=XML_Bibles
XML_BIBLES_DUP=XML_Bibles_dup
if [ ! -e $BIBLE_TOOLS ]; then
echo "Cloning bible-corpus-tools repository..."
git clone https://github.com/christos-c/bible-corpus-tools.git
fi
mkdir -p $BIBLE_TOOLS/bin $XML_BIBLES $XML_BIBLES_DUP
javac -cp "$BIBLE_TOOLS/lib/*" -d $BIBLE_TOOLS/bin $BIBLE_TOOLS/src/bible/readers/*.java $BIBLE_TOOLS/src/bible/*.java
download_data bible.tar.gz "https://github.com/christos-c/bible-corpus/archive/v1.2.1.tar.gz"
tar xvzf bible.tar.gz
cp bible-corpus-1.2.1/bibles/{Greek.xml,English.xml,Nepali.xml} $XML_BIBLES/
cp bible-corpus-1.2.1/bibles/{Greek.xml,English-WEB.xml,Nepali.xml} $XML_BIBLES_DUP/
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES_DUP
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES_DUP
cat $XML_BIBLES/aligned/*/English.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$SRC
cat $XML_BIBLES/aligned/*/Nepali.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$NE_TGT
cat $XML_BIBLES_DUP/aligned/*/English-WEB.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$SRC
cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_TGT
REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP )
# Download and extract the Penn Treebank dataset
NE_TAGGED=$ROOT/new_submissions_parallel_corpus_project_Nepal
NE_TAGGED_URL="http://www.cle.org.pk/Downloads/ling_resources/parallelcorpus/NepaliTaggedCorpus.zip"
EN_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.en.patch"
NE_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.ne.patch"
MOSES=mosesdecoder
MOSES_TOK=$MOSES/scripts/tokenizer
EN_PATCH_REGEX="{s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
NE_PATCH_REGEX="{s:\p{Cf}::g;s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
download_data $DATA/nepali-penn-treebank.$SRC.patch $EN_TAGGED_PATCH_URL
download_data $DATA/nepali-penn-treebank.$NE_TGT.patch $NE_TAGGED_PATCH_URL
download_data original.zip $NE_TAGGED_URL
unzip -o original.zip -d $ROOT
cat $NE_TAGGED/00.txt $NE_TAGGED/01.txt $NE_TAGGED/02.txt > $NE_TAGGED/nepali-penn-treebank.$SRC
cat $NE_TAGGED/00ne_revised.txt $NE_TAGGED/01ne_revised.txt $NE_TAGGED/02ne_revised.txt > $NE_TAGGED/nepali-penn-treebank.$NE_TGT
patch $NE_TAGGED/nepali-penn-treebank.$SRC -i $DATA/nepali-penn-treebank.$SRC.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$SRC
patch $NE_TAGGED/nepali-penn-treebank.$NE_TGT -i $DATA/nepali-penn-treebank.$NE_TGT.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT
if [ ! -e $MOSES ]; then
echo "Cloning moses repository..."
git clone https://github.com/moses-smt/mosesdecoder.git
fi
cat $NE_TAGGED/nepali-penn-treebank-patched.$SRC | \
perl -anpe "$EN_PATCH_REGEX" | \
$MOSES_TOK/tokenizer.perl -l $SRC | \
$MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$SRC
cat $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT | \
perl -CIO -anpe "$NE_PATCH_REGEX" | \
$MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$NE_TGT
# Download nepali dictionary data
NE_DICT=$NE_ROOT/dictionaries
download_data $NE_DICT "http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz"
tar xvzf $NE_DICT
cp dictionaries/dict.ne $NE_ROOT/dictionary.$NE_TGT-$SRC
REMOVE_FILE_PATHS+=( $NE_DICT dictionaries )
REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch )
# Remove the temporary files
for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do
rm -rf ${REMOVE_FILE_PATHS[i]}
done
# Copy the training data
si=si_LK
ne=ne_NP
en=en_XX
cat $SI_ROOT/GNOMEKDEUbuntu.en-si.si $SI_ROOT/OpenSubtitles2018.en-si.si > $DESTDIR/train.$si-$en.$si
cat $SI_ROOT/GNOMEKDEUbuntu.en-si.en $SI_ROOT/OpenSubtitles2018.en-si.en > $DESTDIR/train.$si-$en.$en
cat $NE_ROOT/bible_dup.en-ne.ne $NE_ROOT/bible.en-ne.ne $NE_ROOT/globalvoices.2018q4.ne-en.ne $NE_ROOT/GNOMEKDEUbuntu.en-ne.ne $NE_ROOT/nepali-penn-treebank.ne > $DESTDIR/train.$ne-$en.$ne
cat $NE_ROOT/bible_dup.en-ne.en $NE_ROOT/bible.en-ne.en $NE_ROOT/globalvoices.2018q4.ne-en.en $NE_ROOT/GNOMEKDEUbuntu.en-ne.en $NE_ROOT/nepali-penn-treebank.en > $DESTDIR/train.$ne-$en.$en
#Download the test sets
wget https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz
tar -xvzf wikipedia_en_ne_si_test_sets.tgz
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.ne $DESTDIR/valid.$ne-$en.$ne
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.en $DESTDIR/valid.$ne-$en.$en
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.si $DESTDIR/valid.$si-$en.$si
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.en $DESTDIR/valid.$si-$en.$en
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.ne $DESTDIR/devtest.$ne-$en.$ne
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.en $DESTDIR/devtest.$ne-$en.$en
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.si $DESTDIR/devtest.$si-$en.$si
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.en $DESTDIR/devtest.$si-$en.$en
cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.ne $DESTDIR/test.$ne-$en.$ne
cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.en $DESTDIR/test.$ne-$en.$en
cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.si $DESTDIR/test.$si-$en.$si
cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.en $DESTDIR/test.$si-$en.$en
rm -rf wikipedia_en_ne_si_test_sets.tgz wikipedia_en_ne_si_test_sets
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
IITB=$WORKDIR_ROOT/IITB
mkdir -p $IITB
pushd $IITB
wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz
tar -xvzf parallel.tgz
wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/dev_test.tgz
tar -xvzf dev_test.tgz
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
cp parallel/IITB.en-hi.en $DESTDIR/train.hi_IN-en_XX.en_XX
cp parallel/IITB.en-hi.hi $DESTDIR/train.hi_IN-en_XX.hi_IN
cp dev_test/dev.en $DESTDIR/valid.hi_IN-en_XX.en_XX
cp dev_test/dev.hi $DESTDIR/valid.hi_IN-en_XX.hi_IN
cp dev_test/test.en $DESTDIR/test.hi_IN-en_XX.en_XX
cp dev_test/test.hi $DESTDIR/test.hi_IN-en_XX.hi_IN
popd
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#echo 'Cloning Moses github repository (for tokenization scripts)...'
#git clone https://github.com/moses-smt/mosesdecoder.git
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
data_root=${WORKDIR_ROOT}/iwsltv2
DESTDIR=${WORKDIR_ROOT}/ML50/raw
langs="ar_AR it_IT nl_XX ko_KR vi_VN"
echo "data_root: $data_root"
download_path=${data_root}/downloads
raw=${DESTDIR}
tmp=${data_root}/tmp
orig=${data_root}/orig
mkdir -p $download_path $orig $raw $tmp
#######################
download_iwslt(){
iwslt_key=$1
src=$2
tgt=$3
save_prefix=$4
pushd ${download_path}
if [[ ! -f ${save_prefix}$src-$tgt.tgz ]]; then
wget https://wit3.fbk.eu/archive/${iwslt_key}/texts/$src/$tgt/$src-$tgt.tgz -O ${save_prefix}$src-$tgt.tgz
[ $? -eq 0 ] && return 0
fi
popd
}
extract_iwslt(){
src=$1
tgt=$2
prefix=$3
pushd $orig
tar zxvf ${download_path}/${prefix}$src-${tgt}.tgz
popd
}
generate_train(){
lsrc=$1
ltgt=$2
src=${lsrc:0:2}
tgt=${ltgt:0:2}
for ll in $lsrc $ltgt; do
l=${ll:0:2}
f="$orig/*/train.tags.$src-$tgt.$l"
f_raw=$raw/train.$lsrc-$ltgt.$ll
cat $f \
| grep -v '<url>' \
| grep -v '<talkid>' \
| grep -v '<keywords>' \
| grep -v '<speaker>' \
| grep -v '<reviewer' \
| grep -v '<translator' \
| grep -v '<doc' \
| grep -v '</doc>' \
| sed -e 's/<title>//g' \
| sed -e 's/<\/title>//g' \
| sed -e 's/<description>//g' \
| sed -e 's/<\/description>//g' \
| sed 's/^\s*//g' \
| sed 's/\s*$//g' \
> $f_raw
[ $? -eq 0 ] && echo "extracted $f to $f_raw"
done
return 0
}
convert_valid_test(){
src=$1
tgt=$2
for l in $src $tgt; do
echo "lang: ${l}"
for o in `ls $orig/*/IWSLT*.TED*.$src-$tgt.$l.xml`; do
fname=${o##*/}
f=$tmp/${fname%.*}
echo "$o => $f"
grep '<seg id' $o \
| sed -e 's/<seg id="[0-9]*">\s*//g' \
| sed -e 's/\s*<\/seg>\s*//g' \
| sed -e "s/\’/\'/g" \
> $f
echo ""
done
done
}
generate_subset(){
lsrc=$1
ltgt=$2
src=${lsrc:0:2}
tgt=${ltgt:0:2}
subset=$3
prefix=$4
for ll in $lsrc $ltgt; do
l=${ll:0:2}
f=$tmp/$prefix.${src}-${tgt}.$l
if [[ -f $f ]]; then
cp $f $raw/$subset.${lsrc}-$ltgt.${ll}
fi
done
}
#################
echo "downloading iwslt training and dev data"
# using multilingual for it, nl
download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
download_iwslt "2017-01-trnted" ar en
download_iwslt "2017-01-trnted" en ar
download_iwslt "2017-01-trnted" ko en
download_iwslt "2017-01-trnted" en ko
download_iwslt "2015-01" vi en
download_iwslt "2015-01" en vi
echo "donwloading iwslt test data"
download_iwslt "2017-01-mted-test" it en "test."
download_iwslt "2017-01-mted-test" en it "test."
download_iwslt "2017-01-mted-test" nl en "test."
download_iwslt "2017-01-mted-test" en nl "test."
download_iwslt "2017-01-ted-test" ar en "test."
download_iwslt "2017-01-ted-test" en ar "test."
download_iwslt "2017-01-ted-test" ko en "test."
download_iwslt "2017-01-ted-test" en ko "test."
download_iwslt "2015-01-test" vi en "test."
download_iwslt "2015-01-test" en vi "test."
echo "extract training data tar balls"
extract_iwslt DeEnItNlRo DeEnItNlRo
extract_iwslt ar en
extract_iwslt en ar
extract_iwslt ko en
extract_iwslt en ko
extract_iwslt vi en
extract_iwslt en vi
echo "extracting iwslt test data"
for lang in $langs; do
l=${lang:0:2}
extract_iwslt $l en "test."
extract_iwslt en $l "test."
done
echo "convert dev and test data"
for lang in $langs; do
s_lang=${lang:0:2}
convert_valid_test $s_lang en
convert_valid_test en $s_lang
done
echo "creating training data into $raw"
for lang in $langs; do
generate_train $lang en_XX
generate_train en_XX $lang
done
echo "creating iwslt dev data into raw"
generate_subset en_XX vi_VN valid "IWSLT15.TED.tst2013"
generate_subset vi_VN en_XX valid "IWSLT15.TED.tst2013"
generate_subset en_XX ar_AR valid "IWSLT17.TED.tst2016"
generate_subset ar_AR en_XX valid "IWSLT17.TED.tst2016"
generate_subset en_XX ko_KR valid "IWSLT17.TED.tst2016"
generate_subset ko_KR en_XX valid "IWSLT17.TED.tst2016"
generate_subset en_XX it_IT valid "IWSLT17.TED.tst2010"
generate_subset it_IT en_XX valid "IWSLT17.TED.tst2010"
generate_subset en_XX nl_XX valid "IWSLT17.TED.tst2010"
generate_subset nl_XX en_XX valid "IWSLT17.TED.tst2010"
echo "creating iswslt test data into raw"
generate_subset en_XX vi_VN test "IWSLT15.TED.tst2015"
generate_subset vi_VN en_XX test "IWSLT15.TED.tst2015"
generate_subset en_XX ar_AR test "IWSLT17.TED.tst2017"
generate_subset ar_AR en_XX test "IWSLT17.TED.tst2017"
generate_subset en_XX ko_KR test "IWSLT17.TED.tst2017"
generate_subset ko_KR en_XX test "IWSLT17.TED.tst2017"
generate_subset en_XX it_IT test "IWSLT17.TED.tst2017.mltlng"
generate_subset it_IT en_XX test "IWSLT17.TED.tst2017.mltlng"
generate_subset en_XX nl_XX test "IWSLT17.TED.tst2017.mltlng"
generate_subset nl_XX en_XX test "IWSLT17.TED.tst2017.mltlng"
# normalize iwslt directions into x-en
pushd $raw
for lang in $langs; do
for split in test valid; do
x_en_f1=$split.$lang-en_XX.en_XX
x_en_f2=$split.$lang-en_XX.${lang}
en_x_f1=$split.en_XX-$lang.en_XX
en_x_f2=$split.en_XX-$lang.${lang}
if [ -f $en_x_f1 ] && [ ! -f $x_en_f1 ]; then
echo "cp $en_x_f1 $x_en_f1"
cp $en_x_f1 $x_en_f1
fi
if [ -f $en_x_f2 ] && [ ! -f $x_en_f2 ]; then
echo "cp $en_x_f2 $x_en_f2"
cp $en_x_f2 $x_en_f2
fi
done
done
popd
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
mkdir -p $SRCDIR
mkdir -p $DESTDIR
cd $SRCDIR
wget http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_languages_corpus.tar.gz
tar -xvzf indic_languages_corpus.tar.gz
SRC_EXTRACT_DIR=$SRCDIR/indic_languages_corpus/bilingual
cp $SRC_EXTRACT_DIR/ml-en/train.ml $DESTDIR/train.ml_IN-en_XX.ml_IN
cp $SRC_EXTRACT_DIR/ml-en/train.en $DESTDIR/train.ml_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ml-en/dev.ml $DESTDIR/valid.ml_IN-en_XX.ml_IN
cp $SRC_EXTRACT_DIR/ml-en/dev.en $DESTDIR/valid.ml_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ml-en/test.ml $DESTDIR/test.ml_IN-en_XX.ml_IN
cp $SRC_EXTRACT_DIR/ml-en/test.en $DESTDIR/test.ml_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ur-en/train.ur $DESTDIR/train.ur_PK-en_XX.ur_PK
cp $SRC_EXTRACT_DIR/ur-en/train.en $DESTDIR/train.ur_PK-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ur-en/dev.ur $DESTDIR/valid.ur_PK-en_XX.ur_PK
cp $SRC_EXTRACT_DIR/ur-en/dev.en $DESTDIR/valid.ur_PK-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ur-en/test.ur $DESTDIR/test.ur_PK-en_XX.ur_PK
cp $SRC_EXTRACT_DIR/ur-en/test.en $DESTDIR/test.ur_PK-en_XX.en_XX
cp $SRC_EXTRACT_DIR/te-en/train.te $DESTDIR/train.te_IN-en_XX.te_IN
cp $SRC_EXTRACT_DIR/te-en/train.en $DESTDIR/train.te_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/te-en/dev.te $DESTDIR/valid.te_IN-en_XX.te_IN
cp $SRC_EXTRACT_DIR/te-en/dev.en $DESTDIR/valid.te_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/te-en/test.te $DESTDIR/test.te_IN-en_XX.te_IN
cp $SRC_EXTRACT_DIR/te-en/test.en $DESTDIR/test.te_IN-en_XX.en_XX
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import itertools
import os
import csv
from collections import defaultdict
from six.moves import zip
import io
import wget
import sys
from subprocess import check_call, check_output
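# Download the TED talks corpus (ted_talks.tar.gz from phontron.com), read the
# per-split all_talks_*.tsv files, and write train/valid/test bitext for the
# requested language directions into ML50/raw, detokenizing with Moses.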
# scripts and data locations
CWD = os.getcwd()
UTILS = f"{CWD}/utils"
MOSES = f"{UTILS}/mosesdecoder"
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
# please download mosesdecoder (https://github.com/moses-smt/mosesdecoder) into ./utils; detokenizer.perl below is used for detokenization:
detok_cmd = f'{MOSES}/scripts/tokenizer/detokenizer.perl'
def call(cmd):
print(f"Executing: {cmd}")
check_call(cmd, shell=True)
class MultiLingualAlignedCorpusReader(object):
"""A class to read TED talk dataset
"""
def __init__(self, corpus_path, delimiter='\t',
target_token=True, bilingual=True, corpus_type='file',
lang_dict={'source': ['fr'], 'target': ['en']},
eval_lang_dict=None, zero_shot=False,
detok=True,
):
self.empty_line_flag = 'NULL'
self.corpus_path = corpus_path
self.delimiter = delimiter
self.bilingual = bilingual
self.lang_dict = lang_dict
self.lang_set = set()
self.target_token = target_token
self.zero_shot = zero_shot
self.eval_lang_dict = eval_lang_dict
self.corpus_type = corpus_type
self.detok = detok
for list_ in self.lang_dict.values():
for lang in list_:
self.lang_set.add(lang)
self.data = dict()
self.data['train'] = self.read_aligned_corpus(split_type='train')
self.data['test'] = self.read_aligned_corpus(split_type='test')
self.data['dev'] = self.read_aligned_corpus(split_type='dev')
def read_data(self, file_loc_):
data_list = list()
with io.open(file_loc_, 'r', encoding='utf8') as fp:
for line in fp:
try:
text = line.strip()
except IndexError:
text = self.empty_line_flag
data_list.append(text)
return data_list
def filter_text(self, dict_):
if self.target_token:
field_index = 1
else:
field_index = 0
data_dict = defaultdict(list)
list1 = dict_['source']
list2 = dict_['target']
for sent1, sent2 in zip(list1, list2):
try:
src_sent = ' '.join(sent1.split()[field_index: ])
except IndexError:
src_sent = 'NULL'
if src_sent.find(self.empty_line_flag) != -1 or len(src_sent) == 0:
continue
elif sent2.find(self.empty_line_flag) != -1 or len(sent2) == 0:
continue
else:
data_dict['source'].append(sent1)
data_dict['target'].append(sent2)
return data_dict
def read_file(self, split_type, data_type):
return self.data[split_type][data_type]
def save_file(self, path_, split_type, data_type, lang):
tok_file = tok_file_name(path_, lang)
with io.open(tok_file, 'w', encoding='utf8') as fp:
for line in self.data[split_type][data_type]:
fp.write(line + '\n')
if self.detok:
de_tok(tok_file, lang)
def add_target_token(self, list_, lang_id):
new_list = list()
token = '__' + lang_id + '__'
for sent in list_:
new_list.append(token + ' ' + sent)
return new_list
def read_from_single_file(self, path_, s_lang, t_lang):
data_dict = defaultdict(list)
with io.open(path_, 'r', encoding='utf8') as fp:
reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
data_dict['source'].append(row[s_lang])
data_dict['target'].append(row[t_lang])
if self.target_token:
text = self.add_target_token(data_dict['source'], t_lang)
data_dict['source'] = text
return data_dict['source'], data_dict['target']
def read_aligned_corpus(self, split_type='train'):
data_dict = defaultdict(list)
iterable = []
s_list = []
t_list = []
if self.zero_shot:
if split_type == "train":
iterable = zip(self.lang_dict['source'], self.lang_dict['target'])
else:
iterable = zip(self.eval_lang_dict['source'], self.eval_lang_dict['target'])
elif self.bilingual:
iterable = itertools.product(self.lang_dict['source'], self.lang_dict['target'])
for s_lang, t_lang in iterable:
if s_lang == t_lang:
continue
if self.corpus_type == 'file':
split_type_file_path = os.path.join(self.corpus_path,
"all_talks_{}.tsv".format(split_type))
s_list, t_list = self.read_from_single_file(split_type_file_path,
s_lang=s_lang,
t_lang=t_lang)
data_dict['source'] += s_list
data_dict['target'] += t_list
new_data_dict = self.filter_text(data_dict)
return new_data_dict
def read_langs(corpus_path):
split_type_file_path = os.path.join(corpus_path, 'extracted',
"all_talks_dev.tsv")
with io.open(split_type_file_path, 'r', encoding='utf8') as fp:
reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
header = next(reader)
return [k for k in header.keys() if k != 'talk_name']
def extra_english(corpus_path, split):
split_type_file_path = os.path.join(corpus_path,
f"all_talks_{split}.tsv")
output_split_type_file_path = os.path.join(corpus_path,
f"all_talks_{split}.en")
with io.open(split_type_file_path, 'r', encoding='utf8') as fp, io.open(output_split_type_file_path, 'w', encoding='utf8') as fw:
reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
line = row['en']
fw.write(line + '\n')
de_tok(output_split_type_file_path, 'en')
def tok_file_name(filename, lang):
seps = filename.split('.')
seps.insert(-1, 'tok')
tok_file = '.'.join(seps)
return tok_file
def de_tok(tok_file, lang):
# seps = tok_file.split('.')
# seps.insert(-1, 'detok')
# de_tok_file = '.'.join(seps)
de_tok_file = tok_file.replace('.tok.', '.')
cmd = 'perl {detok_cmd} -l {lang} < {tok_file} > {de_tok_file}'.format(
detok_cmd=detok_cmd, tok_file=tok_file,
de_tok_file=de_tok_file, lang=lang[:2])
call(cmd)
def extra_bitex(
ted_data_path,
lsrc_lang,
ltrg_lang,
target_token,
output_data_path,
):
def get_ted_lang(lang):
long_langs = ['pt-br', 'zh-cn', 'zh-tw', 'fr-ca']
if lang[:5] in long_langs:
return lang[:5]
elif lang[:4] =='calv':
return lang[:5]
elif lang in ['pt_BR', 'zh_CN', 'zh_TW', 'fr_CA']:
return lang.lower().replace('_', '-')
return lang[:2]
src_lang = get_ted_lang(lsrc_lang)
trg_lang = get_ted_lang(ltrg_lang)
train_lang_dict={'source': [src_lang], 'target': [trg_lang]}
eval_lang_dict = {'source': [src_lang], 'target': [trg_lang]}
obj = MultiLingualAlignedCorpusReader(corpus_path=ted_data_path,
lang_dict=train_lang_dict,
target_token=target_token,
corpus_type='file',
eval_lang_dict=eval_lang_dict,
zero_shot=False,
bilingual=True)
os.makedirs(output_data_path, exist_ok=True)
lsrc_lang = lsrc_lang.replace('-', '_')
ltrg_lang = ltrg_lang.replace('-', '_')
obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
split_type='train', data_type='source', lang=src_lang)
obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
split_type='train', data_type='target', lang=trg_lang)
obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
split_type='test', data_type='source', lang=src_lang)
obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
split_type='test', data_type='target', lang=trg_lang)
obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
split_type='dev', data_type='source', lang=src_lang)
obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
split_type='dev', data_type='target', lang=trg_lang)
def bar_custom(current, total, width=80):
print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r')
def download_and_extract(download_to, extract_to):
url = 'http://phontron.com/data/ted_talks.tar.gz'
filename = f"{download_to}/ted_talks.tar.gz"
if os.path.exists(filename):
print(f'{filename} has already been downloaded so skip')
else:
filename = wget.download(url, filename, bar=bar_custom)
if os.path.exists(f'{extract_to}/all_talks_train.tsv'):
print(f'Already extracted so skip')
else:
extract_cmd = f'tar xzfv "{filename}" -C "{extract_to}"'
call(extract_cmd)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--ted_data_path', type=str, default=WORKDIR_ROOT, required=False)
parser.add_argument(
'--direction-list',
type=str,
# default=None,
#for ML50
default=(
"bn_IN-en_XX,he_IL-en_XX,fa_IR-en_XX,id_ID-en_XX,sv_SE-en_XX,pt_XX-en_XX,ka_GE-en_XX,ka_GE-en_XX,th_TH-en_XX,"
"mr_IN-en_XX,hr_HR-en_XX,uk_UA-en_XX,az_AZ-en_XX,mk_MK-en_XX,gl_ES-en_XX,sl_SI-en_XX,mn_MN-en_XX,"
#non-english directions
# "fr_XX-de_DE," # replaced with wmt20
# "ja_XX-ko_KR,es_XX-pt_XX,ru_RU-sv_SE,hi_IN-bn_IN,id_ID-ar_AR,cs_CZ-pl_PL,ar_AR-tr_TR"
),
required=False)
parser.add_argument('--target-token', action='store_true', default=False)
parser.add_argument('--extract-all-english', action='store_true', default=False)
args = parser.parse_args()
import sys
import json
# TED Talks data directory
ted_data_path = args.ted_data_path
download_to = f'{ted_data_path}/downloads'
extract_to = f'{ted_data_path}/extracted'
#DESTDIR=${WORKDIR_ROOT}/ML50/raw/
output_path = f'{ted_data_path}/ML50/raw'
os.makedirs(download_to, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
os.makedirs(output_path, exist_ok=True)
download_and_extract(download_to, extract_to)
if args.extract_all_english:
for split in ['train', 'dev', 'test']:
extra_english(ted_data_path, split)
exit(0)
if args.direction_list is not None:
directions = args.direction_list.strip().split(',')
directions = [tuple(d.strip().split('-', 1)) for d in directions if d]
else:
langs = read_langs(ted_data_path)
# directions = [
# '{}.{}'.format(src, tgt)
# for src in langs
# for tgt in langs
# if src < tgt
# ]
directions = [('en', tgt) for tgt in langs if tgt != 'en']
print(f'num directions={len(directions)}: {directions}')
for src_lang, trg_lang in directions:
print('--working on {}-{}'.format(src_lang, trg_lang))
extra_bitex(
extract_to,
src_lang,
trg_lang,
target_token=args.target_token,
output_data_path=output_path
)
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
DESTDIR=$WORKDIR_ROOT/ML50/raw
mkdir -p $SRCDIR
mkdir -p $DESTDIR
WAT_MY_EN=wat2020.my-en.zip
cd $SRCDIR
# please refer to http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/ for the latest URL if the following one has expired
#- The data used for WAT2020 are identical to those used in WAT2019.
wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/$WAT_MY_EN
unzip $WAT_MY_EN
SRC_EXTRACT_DIR=$SRCDIR/wat2020.my-en/alt
cp $SRC_EXTRACT_DIR/train.alt.en $DESTDIR/train.my_MM-en_XX.en_XX
cp $SRC_EXTRACT_DIR/train.alt.my $DESTDIR/train.my_MM-en_XX.my_MM
cp $SRC_EXTRACT_DIR/dev.alt.en $DESTDIR/valid.my_MM-en_XX.en_XX
cp $SRC_EXTRACT_DIR/dev.alt.my $DESTDIR/valid.my_MM-en_XX.my_MM
cp $SRC_EXTRACT_DIR/test.alt.en $DESTDIR/test.my_MM-en_XX.en_XX
cp $SRC_EXTRACT_DIR/test.alt.my $DESTDIR/test.my_MM-en_XX.my_MM
from typing import NamedTuple, List
from urllib.parse import urlparse
import os, sys
import subprocess
from subprocess import check_call, check_output
import glob
import wget
import re
import multiprocessing as mp
from functools import partial
import pathlib
from collections import OrderedDict
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
sys.exit(-1)
# scripts and data locations
CWD = os.getcwd()
UTILS = f"{CWD}/utils"
MOSES = f"{UTILS}/mosesdecoder"
SGM_TOOL = f'{MOSES}/scripts/ems/support/input-from-sgm.perl'
TMX2CORPUS = f"{UTILS}/tmx2corpus"
TMX_TOOL = f'python {TMX2CORPUS}/tmx2corpus.py'
to_data_path = f'{WORKDIR_ROOT}/wmt'
download_to = f'{to_data_path}/downloads'
manually_downloads = f'{to_data_path}/downloads'
extract_to = f'{to_data_path}/extracted'
#DESTDIR=${WORKDIR_ROOT}/ML50/raw/
raw_data = f'{WORKDIR_ROOT}/ML50/raw'
####
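# DLDataset describes one WMT download recipe: per-split URL lists (each entry is either a plain URL,
# a (url, local_filename) tuple, or a tuple of part-URLs for a multi-part archive) and per-split file
# patterns used later to pick the extracted files for each language pair (each pattern is a glob string,
# optionally paired with the list of language pairs it applies to).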
class DLDataset(NamedTuple):
name: str
train_urls: List[str]
valid_urls: List[str]
test_urls: List[str]
train_files_patterns: List[str] = []
valid_files_patterns: List[str] = []
test_files_patterns: List[str] = []
def bar_custom(current, total, width=80):
print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r')
def get_downloaded_file(dl_folder, url):
if isinstance(url, tuple):
url, f = url
else:
url_f = urlparse(url)
# f = os.path.split(url_f.path)[-1]
f = '_'.join(url_f.path.split('/')[1:])
return url, f"{dl_folder}/{f}"
def download_parts_and_combine(dl_folder, urls, filename):
parts = []
for url_record in urls:
url, part_file = get_downloaded_file(dl_folder, url_record)
if os.path.exists(part_file):
print(f'{part_file} has already been downloaded so skip')
else:
part_file = wget.download(url, part_file, bar=bar_custom)
parts.append(part_file)
def get_combine_cmd(parts):
#default as tar.gz.??
return f'cat {" ".join(parts)} > {filename}'
combine_cmd = get_combine_cmd(parts)
call(combine_cmd, debug=True)
return filename
def download_a_url(dl_folder, url):
url, filename = get_downloaded_file(dl_folder, url)
if os.path.exists(filename):
print(f'{filename} has already been downloaded so skip')
return filename
print(f'downloading {url} to {filename}')
if isinstance(url, list) or isinstance(url, tuple):
download_parts_and_combine(dl_folder, url, filename)
else:
wget.download(url, filename, bar=bar_custom)
print(f'downloaded: {filename}')
return filename
def download_files(dl_folder, urls, completed_urls={}):
for url_record in urls:
url, _ = get_downloaded_file(dl_folder, url_record)
filename = download_a_url(dl_folder, url_record)
completed_urls[str(url)] = filename
return completed_urls
def check_need_manual_downalod(dl_folder, to_manually_download_urls):
to_be_manually_dowloaded = []
manually_completed_urls = {}
for url_record, instruction in to_manually_download_urls:
url, filename = get_downloaded_file(dl_folder, url_record)
if not os.path.exists(filename):
print(f'{url} needs to be downloaded manually; please follow {instruction} and copy it to {filename}')
to_be_manually_dowloaded.append((url, filename))
else:
manually_completed_urls[url] = filename
# if len(to_be_manually_dowloaded) > 0:
# raise ValueError('Missing files that need to be downloaded manually; stop the process now.')
return to_be_manually_dowloaded
def download_dataset(to_folder, dl_dataset, completed_urls={}):
download_files(to_folder, dl_dataset.train_urls, completed_urls)
download_files(to_folder, dl_dataset.valid_urls, completed_urls)
download_files(to_folder, dl_dataset.test_urls, completed_urls)
print('completed downloading')
return completed_urls
def call(cmd, debug=False):
if debug:
print(cmd)
check_call(cmd, shell=True)
def get_extract_name(file_path):
path = os.path.split(file_path)
return path[-1] + '_extract' #.split('.')[0]
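# extract_file: unpack one downloaded archive into its own sub-folder of extract_folder, choosing the
# command from the file extension (.tgz/.tar.gz, .gz.tar, .tar, .gz, .zip), and drop a DONE marker so
# re-runs can skip archives that were already extracted.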
def extract_file(downloaded_file, extract_folder, get_extract_name=get_extract_name, debug=False):
extract_name = get_extract_name(downloaded_file)
extract_to = f'{extract_folder}/{extract_name}'
os.makedirs(extract_to, exist_ok=True)
if os.path.exists(f'{extract_to}/DONE'):
print(f'{downloaded_file} has already been extracted to {extract_to} so skip')
return extract_to
def get_extract_cmd(filename):
if filename.endswith('.tgz') or filename.endswith('tar.gz'):
return f'tar xzfv {filename} -C {extract_to}'
elif filename.endswith('.gz.tar'):
return f'tar xfv {filename} -C {extract_to}; (cd {extract_to}; gzip -d *.gz; [ $? -eq 0 ] || gzip -d */*.gz)'
elif filename.endswith('.tar'):
return f'tar xfv {filename} -C {extract_to}'
elif filename.endswith('.gz'):
return f'cp {filename} {extract_to}; (cd {extract_to}; gzip -d *.gz)'
elif filename.endswith('.zip'):
return f'unzip {filename} -d {extract_to}'
extract_cmd = get_extract_cmd(downloaded_file)
print(f'extracting {downloaded_file}')
if isinstance(extract_cmd, list):
for c in extract_cmd:
call(c, debug=debug)
else:
call(extract_cmd, debug=debug)
call(f'echo DONE > {extract_to}/DONE')
return extract_to
def extract_all_files(
completed_urls, extract_folder,
get_extract_name=get_extract_name,
completed_extraction={},
debug=False):
extracted_folders = OrderedDict()
for url, downloaded_file in set(completed_urls.items()):
if downloaded_file in completed_extraction:
print(f'{downloaded_file} is already extracted; so skip')
continue
folder = extract_file(downloaded_file, extract_folder, get_extract_name, debug)
extracted_folders[url] = folder
return extracted_folders
def my_glob(folder):
for p in [f'{folder}/*', f'{folder}/*/*', f'{folder}/*/*/*']:
for f in glob.glob(p):
yield f
def sgm2raw(sgm, debug):
to_file = sgm[0:len(sgm) - len('.sgm')]
if os.path.exists(to_file):
debug and print(f'{sgm} already converted to {to_file}; so skip')
return to_file
cmd = f'{SGM_TOOL} < {sgm} > {to_file}'
call(cmd, debug)
return to_file
def tmx2raw(tmx, debug):
to_file = tmx[0:len(tmx) - len('.tmx')]
to_folder = os.path.join(*os.path.split(tmx)[:-1])
if os.path.exists(f'{to_folder}/bitext.en'):
debug and print(f'{tmx} already extracted to {to_file}; so skip')
return to_file
cmd = f'(cd {to_folder}; {TMX_TOOL} {tmx})'
call(cmd, debug)
return to_file
CZENG16_REGEX = re.compile(r'.*?data.plaintext-format/0[0-9]train$')
WMT19_WIKITITLES_REGEX = re.compile(r'.*?wikititles-v1.(\w\w)-en.tsv.gz')
TSV_REGEX = re.compile(r'.*?(\w\w)-(\w\w).tsv$')
def cut_wikitles(wiki_file, debug):
# different languages have different file names:
if wiki_file.endswith('wiki/fi-en/titles.fi-en'):
to_file1 = f'{wiki_file}.fi'
to_file2 = f'{wiki_file}.en'
BACKSLASH = '\\'
cmd1 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
cmd2 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
# elif WMT19_WIKITITLES_REGEX.match(wiki_file):
# src = WMT19_WIKITITLES_REGEX.match(wiki_file).groups()[0]
# to_file1 = f'{wiki_file}.{src}'
# to_file2 = f'{wiki_file}.en'
# cmd1 = f"cat {wiki_file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
# cmd2 = f"cat {wiki_file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
else:
return None
if os.path.exists(to_file1) and os.path.exists(to_file2):
debug and print(f'{wiki_file} already processed to {to_file1} and {to_file2}; so skip')
return wiki_file
call(cmd1, debug=debug)
call(cmd2, debug=debug)
return wiki_file
def cut_tsv(file, debug):
m = TSV_REGEX.match(file)
if m is None:
raise ValueError(f'{file} is not matching tsv pattern')
src = m.groups()[0]
tgt = m.groups()[1]
to_file1 = f'{file}.{src}'
to_file2 = f'{file}.{tgt}'
cmd1 = f"cat {file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
cmd2 = f"cat {file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
if os.path.exists(to_file1) and os.path.exists(to_file2):
debug and print(f'{file} already processed to {to_file1} and {to_file2}; so skip')
return file
call(cmd1, debug=debug)
call(cmd2, debug=debug)
return file
def convert_file_if_needed(file, debug):
if file.endswith('.sgm'):
return sgm2raw(file, debug)
elif file.endswith('.tmx'):
return tmx2raw(file, debug)
elif file.endswith('wiki/fi-en/titles.fi-en'):
return cut_wikitles(file, debug)
# elif WMT19_WIKITITLES_REGEX.match(file):
# return cut_wikitles(file, debug)
elif file.endswith('.tsv'):
return cut_tsv(file, debug)
elif CZENG16_REGEX.match(file):
return convert2czeng17(file, debug)
else:
return file
def convert_files_if_needed(extracted_foldrs, my_glob=my_glob, debug=False):
return {
url: sorted(set(convert_file_if_needed(f, debug) for f in sorted(set(my_glob(folder)))))
for url, folder in extracted_foldrs.items()
}
def match_patt(file_path, file_pattern, src, tgt, lang):
return file_pattern.format(src=src, tgt=tgt, lang=lang) in file_path
def match_patts(file_path, file_patterns, src, tgt, lang):
for file_pattern in file_patterns:
params = { k: v for k, v in [('src', src), ('tgt', tgt), ('lang', lang)] if k in file_pattern}
matching = file_pattern.format(**params)
if isinstance(file_pattern, tuple):
pattern, directions = file_pattern
if f'{src}-{tgt}' in directions and matching in file_path:
return True
else:
if matching in file_path:
return True
return False
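# extracted_glob expands a file pattern into a glob over one extracted folder. Patterns may use
# {src}, {tgt} and {lang} placeholders plus conditional segments {src:...}/{tgt:...} that expand only
# when lang equals src/tgt (otherwise they are dropped). For example, with src=fi, tgt=en, lang=en the
# pattern 'test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}' becomes 'test/newstest2017-fien-ref.en'.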
def extracted_glob(extracted_folder, file_patterns, src, tgt, lang):
def get_matching_pattern(file_pattern):
params = {
k: v
for k, v in [('src', src), ('tgt', tgt), ('lang', lang)]
if '{' + k + '}' in file_pattern
}
file_pattern = re.sub(r'{src:(.*?)}', r'\1' if lang == src else '', file_pattern)
file_pattern = re.sub(r'{tgt:(.*?)}', r'\1' if lang == tgt else '', file_pattern)
file_pattern = file_pattern.format(**params)
return file_pattern
for file_pattern in file_patterns:
if isinstance(file_pattern, tuple):
file_pattern, lang_pairs = file_pattern
if f'{src}-{tgt}' not in lang_pairs:
continue
# print('working on pattern: ', file_pattern, lang_pairs )
matching_pattern = get_matching_pattern(file_pattern)
if matching_pattern is None:
continue
glob_patterns = f'{extracted_folder}/{matching_pattern}'
# print('glob_patterns: ', glob_patterns)
for f in glob.glob(glob_patterns):
yield f
# for debug usage
def all_extracted_files(split, src, tgt, extracted_folders, split_urls):
def get_url(url):
if isinstance(url, tuple):
url, downloaded_file = url
return url
return [
f
for url in split_urls
for f in my_glob(extracted_folders[str(get_url(url))])
]
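# concat_files: for each side of the pair, collect every extracted file that matches the split's
# patterns (using the two-letter codes, e.g. fi/en) and concatenate them into
# {to_folder}/{split}.{src}-{tgt}.{lang}.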
def concat_files(split, src, tgt, extracted_folders, split_urls, path_patterns, to_folder, debug=False):
# if debug:
# print('extracted files to be filtered by patterns: ',
# '\n\t'.join(sorted(all_extracted_files(split, src, tgt, extracted_folders, split_urls))))
for lang in [src, tgt]:
to_file = f'{to_folder}/{split}.{src}-{tgt}.{lang}'
s_src, s_tgt, s_lang = src.split('_')[0], tgt.split('_')[0], lang.split('_')[0]
files = []
for url in split_urls:
if isinstance(url, tuple):
url, downloaded_file = url
if str(url) not in extracted_folders:
print(f'warning: {url} not in extracted files')
for extracted_file in set(
extracted_glob(
extracted_folders[str(url)], path_patterns,
s_src, s_tgt, s_lang)):
files.append(extracted_file)
if len(files) == 0:
print('warning: ', f'No files found for split {to_file}')
continue
files = sorted(set(files))
print(f'concating {len(files)} files into {to_file}')
cmd = ['cat'] + [f'"{f}"' for f in files] + [f'>{to_file}']
cmd = " ".join(cmd)
call(cmd, debug=debug)
UTILS = os.path.join(pathlib.Path(__file__).parent, 'utils')
LID_MODEL = f'{download_to}/lid.176.bin'
LID_MULTI = f'{UTILS}/fasttext_multi_filter.py'
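# lid_filter: run the fastText language-identification model (lid.176.bin, fetched on first use) through
# fasttext_multi_filter.py, which is expected to keep only sentence pairs whose detected languages match
# the requested source/target languages.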
def lid_filter(split, src, tgt, from_folder, to_folder, debug=False):
if not os.path.exists(LID_MODEL):
call(f'wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O {LID_MODEL}')
from_prefix = f'{from_folder}/{split}.{src}-{tgt}'
to_prefix = f'{to_folder}/{split}.{src}-{tgt}'
if os.path.exists(f'{from_prefix}.{src}') and os.path.exists(f'{from_prefix}.{tgt}'):
s_src, s_tgt = src.split('_')[0], tgt.split('_')[0]
cmd = (
f'python {LID_MULTI} --model {LID_MODEL} --inputs {from_prefix}.{src} {from_prefix}.{tgt} '
f'--langs {s_src} {s_tgt} --outputs {to_prefix}.{src} {to_prefix}.{tgt}'
)
print(f'filtering {from_prefix}')
call(cmd, debug=debug)
def concat_into_splits(dl_dataset, src, tgt, extracted_folders, to_folder, debug):
to_folder_tmp = f"{to_folder}_tmp"
os.makedirs(to_folder_tmp, exist_ok=True)
concat_files('train', src, tgt,
extracted_folders,
split_urls=dl_dataset.train_urls,
path_patterns=dl_dataset.train_files_patterns,
to_folder=to_folder_tmp, debug=debug)
lid_filter('train', src, tgt, to_folder_tmp, to_folder, debug)
concat_files('valid', src, tgt,
extracted_folders,
split_urls=dl_dataset.valid_urls,
path_patterns=dl_dataset.valid_files_patterns,
to_folder=to_folder, debug=debug)
concat_files('test', src, tgt,
extracted_folders,
split_urls=dl_dataset.test_urls,
path_patterns=dl_dataset.test_files_patterns,
to_folder=to_folder, debug=debug)
def download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=False):
pool = mp.Pool(processes=num_processes)
download_f = partial(download_a_url, dl_folder)
# materialize the results so every download finishes (and any worker error surfaces) before the pool closes
downloaded_files = list(pool.imap_unordered(download_f, urls))
pool.close()
pool.join()
return downloaded_files
BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")
def run_eval_bleu(cmd):
output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip()
print(output)
bleu = -1.0
for line in output.strip().split('\n'):
m = BLEU_REGEX.search(line)
if m is not None:
bleu = m.groups()[0]
bleu = float(bleu)
break
return bleu
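# check_wmt_test_bleu: sanity-check the assembled test files by scoring them with sacrebleu against the
# official WMT test sets; any side that does not score exactly 100 BLEU is reported as not matching.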
def check_wmt_test_bleu(raw_folder, wmt_lang_pairs):
not_matchings = []
for wmt, src_tgts in wmt_lang_pairs:
for src_tgt in src_tgts:
print(f'checking test bleus for: {src_tgt} at {wmt}')
src, tgt = src_tgt.split('-')
ssrc, stgt = src[:2], tgt[:2]
if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
# reversed direction may have different test set
test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
else:
test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
cmd1 = f'cat {test_src} | sacrebleu -t "{wmt}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
cmd2 = f'cat {test_tgt} | sacrebleu -t "{wmt}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
bleu1 = run_eval_bleu(cmd1)
if bleu1 != 100.0:
not_matchings.append(f'{wmt}:{src_tgt} source side not matching: {test_src}')
bleu2 = run_eval_bleu(cmd2)
if bleu2 != 100.0:
not_matchings.append(f'{wmt}:{src_tgt} target side not matching: {test_tgt}')
return not_matchings
def download_and_extract(
to_folder, lang_pairs, dl_dataset,
to_manually_download_urls,
completed_urls={}, completed_extraction={},
debug=False):
dl_folder = f'{to_folder}/downloads'
extract_folder = f'{to_folder}/extracted'
raw_folder = f'{to_folder}/raw'
lid_filtered = f'{to_folder}/lid_filtered'
os.makedirs(extract_folder, exist_ok=True)
os.makedirs(raw_folder, exist_ok=True)
os.makedirs(lid_filtered, exist_ok=True)
to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
completed_urls = download_dataset(
dl_folder, dl_dataset, completed_urls)
if debug:
print('completed urls: ', completed_urls)
extracted_folders = extract_all_files(
completed_urls,
extract_folder=extract_folder,
completed_extraction=completed_extraction,
debug=debug)
if debug:
print('download files have been extracted to folders: ', extracted_folders)
converted_files = convert_files_if_needed(extracted_folders, debug=False)
for src_tgt in lang_pairs:
print(f'working on {dl_dataset.name}: {src_tgt}')
src, tgt = src_tgt.split('-')
concat_into_splits(dl_dataset,
src=src, tgt=tgt,
extracted_folders=extracted_folders,
to_folder=raw_folder, debug=debug)
print('completed data into: ', raw_folder)
def download_czang16(download_to, username=None):
wgets = [
f'wget --user={username} --password=czeng -P {download_to} http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar'
for i in range(10)]
cmds = []
for i, cmd in enumerate(wgets):
filename = f'{download_to}/data-plaintext-format.{i}.tar'
if os.path.exists(filename):
print(f'{filename} has already been downloaded; so skip')
continue
cmds.append(cmd)
if cmds and username is None:
raise ValueError('No CzEng username given; please register at http://ufal.mff.cuni.cz/czeng/czeng16 to obtain a username for downloading')
for cmd in cmds:
call(cmd)
print('done with downloading czeng1.6')
def download_czeng17_script(download_to, extract_folder, debug=False):
url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip'
filename = f'{download_to}/convert_czeng16_to_17.pl.zip'
extract_to = f'{extract_folder}/{get_extract_name(filename)}'
script_path = f'{extract_to}/convert_czeng16_to_17.pl'
if not os.path.exists(script_path):
wget.download(url, filename, bar=bar_custom)
extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug)
return script_path
czeng17_script_path = ""
def convert2czeng17(file, debug):
en_file = f'{file}.en'
cs_file = f'{file}.cs'
if not os.path.exists(en_file) or not os.path.exists(cs_file):
cs_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f3 > {cs_file}'
en_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f4 > {en_file}'
call(cs_cmd, debug)
call(en_cmd, debug)
else:
print(f'already extracted: {en_file} and {cs_file}')
return file
def extract_czeng17(extract_folder, debug=False):
url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip'
filename = f'{download_to}/convert_czeng16_to_17.pl.zip'
extract_to = f'{extract_folder}/{get_extract_name(filename)}'
script_path = f'{extract_to}/convert_czeng16_to_17.pl'
if not os.path.exists(script_path):
wget.download(url, filename, bar=bar_custom)
extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug)
return script_path
#########
# definitions of wmt data sources
# for es-en
# Punctuation in the official test sets will be encoded with ASCII characters (not complex Unicode characters) as much as possible. You may want to normalize your system's output before submission. You are able to use a rawer version of the test sets that does not have this normalization.
# script to normalize punctuation: http://www.statmt.org/wmt11/normalize-punctuation.perl
wmt13_es_en = DLDataset(
name='wmt13_es-en',
train_urls=[
'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
'http://www.statmt.org/wmt13/training-parallel-un.tgz',
'http://www.statmt.org/wmt13/training-parallel-nc-v8.tgz',
],
valid_urls=[
('http://www.statmt.org/wmt13/dev.tgz', 'wmt13_dev.tgz')
],
test_urls=[
('http://www.statmt.org/wmt13/test.tgz', 'wmt13_test.tgz')
],
train_files_patterns=[
('*/europarl-v7.{src}-{tgt}.{lang}', ['es-en']),
('*commoncrawl.{src}-{tgt}.{lang}', ['es-en']),
('*/news-commentary-v8.{src}-{tgt}.{lang}', ['es-en']),
('un/*undoc.2000.{src}-{tgt}.{lang}', ['es-en']),
] ,
valid_files_patterns=[
('dev/newstest2012.{lang}', ['es-en'])
],
test_files_patterns=[
('test/newstest*.{lang}', ['es-en'])
],
)
wmt14_de_fr_en = DLDataset(
name='wmt14_de_fr_en',
train_urls=[
'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
'http://www.statmt.org/wmt13/training-parallel-un.tgz',
'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz',
('http://www.statmt.org/wmt10/training-giga-fren.tar', 'training-giga-fren.gz.tar'), # it is actually a gz.tar
],
valid_urls=[
('http://www.statmt.org/wmt14/dev.tgz', 'wmt14_dev.tgz'),
],
test_urls=[
('http://www.statmt.org/wmt14/test-full.tgz', 'wmt14_test_full.tgz'), # cleaned test sets
],
train_files_patterns=[
('*/europarl-v7.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
('*commoncrawl.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
('*/*news-commentary-v9.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
('un/undoc.2000.{src}-{tgt}.{lang}', ['fr-en']),
('*giga-{src}{tgt}*{lang}', ['fr-en'])
],
valid_files_patterns=[
('dev/newstest2013.{lang}', ['fr-en', 'de-en'])
],
test_files_patterns=[
('test-full/newstest*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['en-de', 'de-en', 'fr-en', 'en-fr']),
],
)
# pip install git+https://github.com/amake/tmx2corpus.git
wmt16_ro_en = DLDataset(
name='wmt16_ro-en',
train_urls=[
('http://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz', 'wmt16_training-parallel-ep-v8.tgz'),
('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-ro.tmx.gz', 'en-ro.tmx.gz'),
],
valid_urls=[
('http://data.statmt.org/wmt16/translation-task/dev-romanian-updated.tgz', 'wmt16_dev.tgz')
],
test_urls=[
('http://data.statmt.org/wmt16/translation-task/test.tgz', 'wmt16_test.tgz')
],
train_files_patterns=[
('*/*europarl-v8.{src}-{tgt}.{lang}', ['ro-en']),
('bitext.{lang}', ['ro-en'])  # SETIMES bitext produced by tmx2corpus
] ,
valid_files_patterns=[
('dev/newsdev2016*{src}{tgt}*.{lang}', ['ro-en', 'ro-en'])
],
test_files_patterns=[
('test/newstest*{src}{tgt}*.{lang}', ['ro-en', 'en-ro'])
],
)
cwmt_wmt_instruction = 'cwmt download instruction at: http://nlp.nju.edu.cn/cwmt-wmt'
wmt17_fi_lv_tr_zh_en_manual_downloads = [
# fake urls to have unique keys for the data
( ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), cwmt_wmt_instruction),
( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), cwmt_wmt_instruction),
( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), cwmt_wmt_instruction),
( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), cwmt_wmt_instruction),
( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), cwmt_wmt_instruction),
( ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), cwmt_wmt_instruction),
]
wmt17_fi_lv_tr_zh_en = DLDataset(
name='wmt17_fi_lv_tr_zh_en',
train_urls=[
('http://data.statmt.org/wmt17/translation-task/training-parallel-ep-v8.tgz', 'wmt17_training-parallel-ep-v8.tgz'),
'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz',
'http://www.statmt.org/wmt15/wiki-titles.tgz',
('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-tr.tmx.gz', 'en-tr.tmx.gz'),
('http://data.statmt.org/wmt17/translation-task/rapid2016.tgz', 'wmt17_rapid2016.tgz'),
'http://data.statmt.org/wmt17/translation-task/leta.v1.tgz',
'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz',
'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz',
(('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00',
'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01',), 'UNv1.0.en-zh.tar.gz'),
#manually download files:
('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'),
('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'),
('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'),
('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'),
('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'),
('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'),
],
valid_urls=[
('http://data.statmt.org/wmt17/translation-task/dev.tgz', 'wmt17_dev.tgz'),
],
test_urls=[
#NEW: Improved translations for zh test sets
('http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', 'wmt17_test_zh_en.tgz'),
('http://data.statmt.org/wmt17/translation-task/test.tgz', 'wmt17_test_others.tgz')
],
train_files_patterns=[
('casict*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ),
('casia*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ),
('dataum*/Book*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']),
('neu*/NEU*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en'] ),
('*/*UNv1.0.en-zh.{src:zh}{tgt:en}', ['zh-en']),
('training/*news-commentary-v12.{src}-{tgt}.{lang}', ['zh-en', ]),
('*/*europarl-v8.{src}-{tgt}.{lang}', ['fi-en', 'lv-en']),
('wiki/fi-en/titles.{src}-{tgt}.{lang}', ['fi-en', ]),
('rapid2016.{tgt}-{src}.{lang}', ['fi-en', 'lv-en']),
('*/leta.{lang}', ['lv-en']),
('*/dcep.{lang}', ['lv-en']),
('*/farewell.{lang}', ['lv-en']),
('bitext.{lang}', ['tr-en']),
] ,
valid_files_patterns=[
('dev/newsdev2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}',
[
'fi-en', 'lv-en', 'tr-en', 'zh-en',
'en-fi', 'en-lv', 'en-tr', 'en-zh'
]),
('dev/newstest2016*{src}{tgt}-{src:src}{tgt:ref}.{lang}',
[
'fi-en', 'tr-en',
'en-fi', 'en-tr',
]),
],
test_files_patterns=[
('test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
[
'fi-en', 'lv-en', 'tr-en',
'en-fi', 'en-lv', 'en-tr',
]),
('newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
[
'zh-en',
'en-zh'
]),
],
)
czeng_instruction = 'download instruction at: http://ufal.mff.cuni.cz/czeng/czeng16'
#alternative: use the prepared data but detokenize it?
wmt18_cs_et_en_manual_downloads = [
# for cs, one needs to register and download CzEng 1.6;
# better results can be obtained by using the subset of sentences released as CzEng 1.7.
# ((f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar',
# f'data-plaintext-format.{i}.tar'), czeng_instruction)
# for i in range(10)
]
wmt18_cs_et_en = DLDataset(
name='wmt18_cs_et_en',
train_urls=[
'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz',
'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-cs.zipporah0-dedup-clean.tgz',
'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-et.zipporah0-dedup-clean.tgz',
'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz',
('http://data.statmt.org/wmt18/translation-task/rapid2016.tgz', 'wmt18_rapid2016.tgz'),
# (tuple(
# (f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar',
# f'data-plaintext-format.{i}.tar')
# for i in range(10)
# ),
# 'czeng16_data_plaintext.gz.tar'),
],
valid_urls=[
('http://data.statmt.org/wmt18/translation-task/dev.tgz', 'wmt18_dev.tgz'),
],
test_urls=[
('http://data.statmt.org/wmt18/translation-task/test.tgz', 'wmt18_test.tgz'),
],
train_files_patterns=[
# ('*/*europarl-v7.{src}-{tgt}.{lang}', ['cs-en']),
('*/*europarl-v8.{src}-{tgt}.{lang}', ['et-en']),
# ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['cs-en', 'et-en']),
('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['et-en']),
# ('*commoncrawl.{src}-{tgt}.{lang}', ['cs-en']),
# ('*/news-commentary-v13.{src}-{tgt}.{lang}', ['cs-en']),
# ('data.plaintext-format/*train.{lang}', ['cs-en']),
('rapid2016.{tgt}-{src}.{lang}', ['et-en']),
] ,
valid_files_patterns=[
('dev/newsdev2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']),
# ('dev/newstest2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['cs-en'])
],
test_files_patterns=[
('test/newstest2018-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
# ['cs-en', 'et-en']),
['et-en']),
]
)
ru_en_yandex_instruction = 'Yandex Corpus download instruction at: https://translate.yandex.ru/corpus?lang=en'
wmt19_ru_gu_kk_lt_manual_downloads = [
(('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ru_en_yandex_instruction)
]
wmt19_ru_gu_kk_lt = DLDataset(
name='wmt19_ru_gu_kk_lt',
train_urls=[
'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz',
'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz',
'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz',
'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14-wmt19.en-kk.tsv.gz',
'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz',
'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz',
'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz',
'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz',
(('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00',
'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01',
'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02',),
'wmt19_UNv1.0.en-ru.tar.gz'),
'https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-lt.tmx.zip',
('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'),
],
valid_urls=[
('http://data.statmt.org/wmt19/translation-task/dev.tgz', 'wmt19_dev.tgz'),
],
test_urls=[
('http://data.statmt.org/wmt19/translation-task/test.tgz', 'wmt19_test.tgz'),
],
train_files_patterns=[
('*europarl-v9.{src}-{tgt}.tsv.{lang}', ['lt-en']),
#paracrawl
('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['ru-en']),
('bitext.{lang}', ['lt-en',]),
('*commoncrawl.{src}-{tgt}.{lang}', ['ru-en',]),
('*news-commentary-v14-wmt19.{tgt}-{src}.tsv.{lang}', ['kk-en', ]),
('*news-commentary-v14.{tgt}-{src}.tsv.{lang}', ['ru-en']),
#yandex
('corpus.{tgt}_{src}.1m.{lang}', ['ru-en']),
('wikititles_v1_wikititles-v1.{src}-{tgt}.tsv.{lang}', ['ru-en', 'kk-en', 'lt-en', 'gu-en']),
('*/UNv1.0.{tgt}-{src}.{lang}', ['ru-en']),
#rapid
('bitext.{lang}', ['lt-en'])
],
valid_files_patterns=[
('dev/newsdev2019*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['gu-en', 'kk-en', 'lt-en']),
('dev/newstest2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['ru-en']),
],
test_files_patterns=[
('sgm/newstest2019-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
['ru-en', 'gu-en', 'kk-en', 'lt-en', 'en-ru', 'en-gu', 'en-kk', 'en-lt']),
]
)
#########
if __name__ == "__main__":
# speed up the downloads with multiprocessing
dl_folder = f'{to_data_path}/downloads'
extract_folder = f'{to_data_path}/extracted'
urls = [
url
for dataset in [wmt13_es_en, wmt14_de_fr_en, wmt16_ro_en, wmt18_cs_et_en, wmt19_ru_gu_kk_lt]
for urls in [dataset.train_urls, dataset.valid_urls, dataset.test_urls]
for url in urls
]
urls = set(urls)
download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=True)
# check manual downloads
to_manually_download_urls = (
wmt17_fi_lv_tr_zh_en_manual_downloads + wmt18_cs_et_en_manual_downloads + wmt19_ru_gu_kk_lt_manual_downloads
)
to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
if len(to_be_manually_dowloaded) > 0:
print('Missing files that need to be downloaded manually; stop the process now.')
exit(-1)
completed_urls = {}
completed_extraction = {}
def work_on_wmt(directions, wmt_data):
download_and_extract(
to_data_path,
directions,
wmt_data,
to_manually_download_urls=to_manually_download_urls,
completed_urls=completed_urls, completed_extraction=completed_extraction, debug=True)
work_on_wmt(
['es_XX-en_XX'],
wmt13_es_en,)
work_on_wmt(
[
'fr_XX-en_XX', 'en_XX-fr_XX',
# 'en_XX-de_DE', 'de_DE-en_XX',
],
wmt14_de_fr_en,)
work_on_wmt(
['ro_RO-en_XX', 'en_XX-ro_RO'],
wmt16_ro_en,)
work_on_wmt(
[
# 'zh_CN-en_XX',
'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX',
#in case the reversed directions have different train/valid/test data
# 'en_XX-zh_CN',
'en_XX-lv_LV', 'en_XX-fi_FI', 'en_XX-tr_TR',
],
wmt17_fi_lv_tr_zh_en, )
# czeng17_script_path = download_czeng17_script(download_to, extract_to, debug=False)
# cz_username = None
work_on_wmt(
[
# 'cs_CZ-en_XX',
'et_EE-en_XX'],
wmt18_cs_et_en,)
work_on_wmt(
[
# 'ru_RU-en_XX', 'en_XX-ru_RU',
'gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX',
#in case the reversed directions have different train/valid/test data
'en_XX-gu_IN', 'en_XX-kk_KZ', 'en_XX-lt_LT'
],
wmt19_ru_gu_kk_lt,)
not_matching = check_wmt_test_bleu(
f'{to_data_path}/raw',
[
('wmt13', ['es_XX-en_XX']),
('wmt14/full', ['fr_XX-en_XX',]),
('wmt16', ['ro_RO-en_XX',]),
# ('wmt17/improved', ['zh_CN-en_XX']),
('wmt17', [ 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX']),
('wmt18', ['cs_CZ-en_XX', 'et_EE-en_XX']),
('wmt19', ['gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX']),
#'ru_RU-en_XX',
]
)
if len(not_matching) > 0:
print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
set -x -e
# TODO update the workdir and dest dir name
# put fasttext model
WORKDIR=$WORKDIR_ROOT
# put intermediate files
TMP_DIR=$WORKDIR_ROOT/tmp/tmp_wmt20_lowres_download
# output {train,valid,test} files to dest
DEST=$WORKDIR_ROOT/ML50/raw
UTILS=$PWD/utils
# per dataset locations
COMMONCRAWL_DIR=$TMP_DIR/commoncrawl
YANDEX_CORPUS=$WORKDIR_ROOT/wmt20/official/ru/yandex/1mcorpus.zip
# unzipped
CZENG_CORPUS=$WORKDIR_ROOT/wmt20/official/cs/czeng/czeng20-train
CCMT_DIR=$WORKDIR_ROOT/wmt20/official/zh/ccmt/parallel
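# download_and_select SUBFOLDER URL UNCOMPRESS_CMD LANG INPUT_FILEPATH [LANG_COL EN_COL]
# downloads and uncompresses one corpus into SUBFOLDER; when the two optional column indices are given,
# the tab-separated INPUT_FILEPATH is split into .$LANG/.en files, and the results are exposed as
# SUBFOLDER.$LANG / SUBFOLDER.en symlinks for later concatenation.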
download_and_select() {
SUBFOLDER=$1
URL=$2
UNCOMPRESS_CMD=$3
LANG=$4
INPUT_FILEPATH=$5
if [[ $# -gt 5 ]]; then
LANG_COL=$6
EN_COL=$7
fi
mkdir -p $SUBFOLDER
cd $SUBFOLDER
wget -nc --content-disposition $URL
$UNCOMPRESS_CMD
if [[ $# -gt 5 ]]; then
cut -f$LANG_COL $INPUT_FILEPATH > $INPUT_FILEPATH.$LANG
cut -f$EN_COL $INPUT_FILEPATH > $INPUT_FILEPATH.en
fi
cd ..
ln -sf $SUBFOLDER/$INPUT_FILEPATH.$LANG $SUBFOLDER.$LANG
ln -sf $SUBFOLDER/$INPUT_FILEPATH.en $SUBFOLDER.en
}
prepare_lid() {
pip install fasttext
# TODO specify global workdir
MODEL=$WORKDIR/fasttext/lid.176.bin
LID_MULTI=$UTILS/fasttext_multi_filter.py
if [ ! -f "$MODEL" ]; then
echo "downloading fasttext lid model..."
mkdir -p $WORKDIR/fasttext
wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $MODEL
fi
}
prepare_moses() {
pushd $UTILS
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
popd
}
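# lid_filter SRC SRC_FILE SRC_OUTPUT TGT TGT_FILE TGT_OUTPUT
# filters a parallel corpus with the fastText LID model via fasttext_multi_filter.py, which is expected
# to keep only pairs whose detected languages match SRC/TGT.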
lid_filter() {
# TODO specify global workdir
MODEL=$WORKDIR/fasttext/lid.176.bin
LID_MULTI=$UTILS/fasttext_multi_filter.py
prepare_lid
SRC=$1
SRC_FILE=$2
SRC_OUTPUT=$3
TGT=$4
TGT_FILE=$5
TGT_OUTPUT=$6
python $LID_MULTI --model $MODEL --inputs $SRC_FILE $TGT_FILE --langs $SRC $TGT --outputs $SRC_OUTPUT $TGT_OUTPUT
}
prepare_ja_ted() {
mkdir -p ted
cd ted
wget -nc https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz
tar -zxvf en-ja.tgz
cat en-ja/train.tags.en-ja.en | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.en
cat en-ja/train.tags.en-ja.ja | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.ja
cd ..
ln -sf ted/en-ja/train.en-ja.ja ted.ja
ln -sf ted/en-ja/train.en-ja.en ted.en
}
prepare_ja() {
OUTPUT_DIR=$TMP_DIR/ja
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select paracrawl "http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/en-ja.tar.gz" "tar -zxvf en-ja.tar.gz" ja en-ja/en-ja.bicleaner05.txt 4 3 &
download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ja.tsv.gz" "gunzip -f news-commentary-v15.en-ja.tsv.gz" ja news-commentary-v15.en-ja.tsv 2 1 &
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ja-en.tsv.gz" "gunzip -f wikititles-v2.ja-en.tsv.gz" ja wikititles-v2.ja-en.tsv 1 2 &
download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ja.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ja.langid.tsv.gz" ja WikiMatrix.v1.en-ja.langid.tsv 3 2 &
download_and_select subtitle "https://nlp.stanford.edu/projects/jesc/data/split.tar.gz" "tar -zxvf split.tar.gz" ja split/train 2 1 &
download_and_select kftt "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" "tar -zxvf kftt-data-1.0.tar.gz" ja kftt-data-1.0/data/orig/kyoto-train &
prepare_ja_ted &
# wait for the TED data and all other background downloads to finish
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.ja" | sort -V | xargs cat > all.ja
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter ja all.ja $DEST/train.ja_XX-en_XX.ja_XX en all.en $DEST/train.ja_XX-en_XX.en_XX
}
prepare_ta() {
OUTPUT_DIR=$TMP_DIR/ta
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ta-en.tsv.gz" "gunzip -f wikititles-v2.ta-en.tsv.gz" ta wikititles-v2.ta-en.tsv 1 2 &
download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ta.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ta.langid.tsv.gz" ta WikiMatrix.v1.en-ta.langid.tsv 3 2 &
download_and_select pmindia "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.ta-en.tsv" "" ta pmindia.v1.ta-en.tsv 2 1 &
download_and_select tanzil "https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-ta.txt.zip" "unzip en-ta.txt.zip" ta Tanzil.en-ta &
download_and_select pib "http://preon.iiit.ac.in/~jerin/resources/datasets/pib-v0.tar" "tar -xvf pib-v0.tar" ta pib/en-ta/train &
download_and_select mkb "http://preon.iiit.ac.in/~jerin/resources/datasets/mkb-v0.tar" "tar -xvf mkb-v0.tar" ta mkb/en-ta/mkb &
download_and_select ufal "http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz" "tar -zxvf en-ta-parallel-v2.tar.gz" ta en-ta-parallel-v2/corpus.bcn.train &
wait
# need special handling for nlpc
mkdir -p nlpc
cd nlpc
wget -nc https://raw.githubusercontent.com/nlpc-uom/English-Tamil-Parallel-Corpus/master/En-Ta%20Corpus/En-Ta%20English.txt
wget -nc https://github.com/nlpc-uom/English-Tamil-Parallel-Corpus/raw/master/En-Ta%20Corpus/En-Ta%20Tamil.txt
tail -n +4 "En-Ta English.txt" > en-ta.en
tail -n +4 "En-Ta Tamil.txt" > en-ta.ta
cd ..
ln -sf nlpc/en-ta.en nlpc.en
ln -sf nlpc/en-ta.ta nlpc.ta
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.ta" | sort -V | xargs cat > all.ta
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter ta all.ta $DEST/train.ta_IN-en_XX.ta_IN en all.en $DEST/train.ta_IN-en_XX.en_XX
}
prepare_iu() {
OUTPUT_DIR=$TMP_DIR/iu
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select nh "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" "tar -zxvf Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0.1.tgz" iu Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/NunavutHansard > /dev/null &
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.iu-en.tsv.gz" "gunzip -f wikititles-v2.iu-en.tsv.gz" iu wikititles-v2.iu-en.tsv 1 2 &
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.iu" | sort -V | xargs cat | nh/Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/scripts/normalize-iu-spelling.pl > all.iu
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
paste all.iu all.en | awk -F $'\t' '$1!=""&&$2!=""' > all.iuen
cut -f1 all.iuen > $DEST/train.iu_CA-en_XX.iu_CA
cut -f2 all.iuen > $DEST/train.iu_CA-en_XX.en_XX
}
prepare_km() {
OUTPUT_DIR=$TMP_DIR/km
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-km.xz" "unxz wmt20-sent.en-km.zx" km wmt20-sent.en-km 2 1 &
# km-parallel has multiple sets, concat all of them together
mkdir -p opus
cd opus
wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/km-parallel.tgz"
tar -zxvf km-parallel.tgz
find ./km-parallel -maxdepth 1 -name "*.km" | sort -V | xargs cat > opus.km
find ./km-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
cd ..
ln -sf opus/opus.km .
ln -sf opus/opus.en .
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.km" | sort -V | xargs cat > all.km
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter km all.km $DEST/train.km_KH-en_XX.km_KH en all.en $DEST/train.km_KH-en_XX.en_XX
}
prepare_ps() {
OUTPUT_DIR=$TMP_DIR/ps
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz" "unxz wmt20-sent.en-ps.xz" ps wmt20-sent.en-ps 2 1 &
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ps-en.tsv.gz" "gunzip -f wikititles-v2.ps-en.tsv.gz" ps wikititles-v2.ps-en.tsv 1 2 &
# ps-parallel has multiple sets, concat all of them together
mkdir -p opus
cd opus
wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz"
tar -zxvf ps-parallel.tgz
find ./ps-parallel -maxdepth 1 -name "*.ps" | sort -V | xargs cat > opus.ps
find ./ps-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
cd ..
ln -sf opus/opus.ps opus.ps
ln -sf opus/opus.en opus.en
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.ps" | sort -V | xargs cat > all.ps
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter ps all.ps $DEST/train.ps_AF-en_XX.ps_AF en all.en $DEST/train.ps_AF-en_XX.en_XX
}
download_commoncrawl() {
mkdir -p $COMMONCRAWL_DIR
cd $COMMONCRAWL_DIR
wget -nc "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
tar -zxvf training-parallel-commoncrawl.tgz
}
link_commoncrawl() {
LANG=$1
ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.en commoncrawl.en
ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.$LANG commoncrawl.$LANG
}
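# strip_xlf INPUT_FILE SRC TGT: pull the <source>/<target> segments out of an XLIFF (.xlf) file and
# write them as plain text to INPUT_FILE.$SRC / INPUT_FILE.$TGT, one segment per line.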
strip_xlf() {
INPUT_FILE=$1
SRC=$2
TGT=$3
grep '<source xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$SRC
grep '<target xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$TGT
}
download_and_process_tilde() {
URL=$1
UNCOMPRESS_CMD=$2
FILENAME=$3
LANG=$4
PROCESS_CMD=$5
mkdir -p tilde
cd tilde
wget -nc $URL
$UNCOMPRESS_CMD
echo "executing cmd"
echo $PROCESS_CMD
$PROCESS_CMD
cd ..
ln -sf tilde/$FILENAME.$LANG tilde.$LANG
ln -sf tilde/$FILENAME.en tilde.en
}
prepare_cs() {
OUTPUT_DIR=$TMP_DIR/cs
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
#download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz" "gunzip europarl-v10.cs-en.tsv.gz" cs europarl-v10.cs-en.tsv 1 2 &
#download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-cs.txt.gz" "gunzip en-cs.txt.gz" cs en-cs.txt 2 1 &
#link_commoncrawl cs
#download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.cs-en.tsv.gz" "gunzip news-commentary-v15.cs-en.tsv.gz" cs news-commentary-v15.cs-en.tsv 1 2 &
#download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.cs-en.tsv.gz" "gunzip wikititles-v2.cs-en.tsv.gz" cs wikititles-v2.cs-en.tsv 1 2 &
#download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.cs-en.xlf.gz" "gunzip RAPID_2019.cs-en.xlf.gz" RAPID_2019.cs-en.xlf cs "strip_xlf RAPID_2019.cs-en.xlf cs en" &
#download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.cs-en.langid.tsv.gz" "gunzip WikiMatrix.v1.cs-en.langid.tsv.gz" cs WikiMatrix.v1.cs-en.langid.tsv 2 3 &
#wait
# remove previous results
#rm -f all.??
#find ./ -maxdepth 1 -name "*.cs" | sort -V | xargs cat > all.cs
#find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
if [ ! -f "$CZENG_CORPUS" ] ;
then
echo "Please download the CzEng corpus manually and place it at $CZENG_CORPUS. Exiting..."
exit
fi
cat $CZENG_CORPUS | sed '/^$/d' | cut -f5 > all.cs
cat $CZENG_CORPUS | sed '/^$/d' | cut -f6 > all.en
lid_filter cs all.cs $DEST/train.cs_CZ-en_XX.cs_CZ en all.en $DEST/train.cs_CZ-en_XX.en_XX
}
prepare_de() {
OUTPUT_DIR=$TMP_DIR/de
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz" "gunzip europarl-v10.de-en.tsv.gz" de europarl-v10.de-en.tsv 1 2 &
download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-de.txt.gz" "gunzip en-de.txt.gz" de en-de.txt 2 1 &
link_commoncrawl de
download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.de-en.tsv.gz" "gunzip news-commentary-v15.de-en.tsv.gz" de news-commentary-v15.de-en.tsv 1 2 &
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.de-en.tsv.gz" "gunzip wikititles-v2.de-en.tsv.gz" de wikititles-v2.de-en.tsv 1 2 &
download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.de-en.xlf.gz" "gunzip RAPID_2019.de-en.xlf.gz" RAPID_2019.de-en.xlf de "strip_xlf RAPID_2019.de-en.xlf de en" &
download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.de-en.langid.tsv.gz" "gunzip WikiMatrix.v1.de-en.langid.tsv.gz" de WikiMatrix.v1.de-en.langid.tsv 2 3 &
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.de" | sort -V | xargs cat > all.de
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter de all.de $DEST/train.de_DE-en_XX.de_DE en all.en $DEST/train.de_DE-en_XX.en_XX
}
prepare_tmx() {
TMX_FILE=$1
git clone https://github.com/amake/TMX2Corpus $UTILS/tmx2corpus
pip install tinysegmenter
python $UTILS/tmx2corpus/tmx2corpus.py $TMX_FILE
}
prepare_pl() {
OUTPUT_DIR=$TMP_DIR/pl
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
# download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.pl-en.tsv.gz" "gunzip europarl-v10.pl-en.tsv.gz" pl europarl-v10.pl-en.tsv 1 2 &
# download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-pl.txt.gz" "gunzip en-pl.txt.gz" pl en-pl.txt 2 1 &
# download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.pl-en.tsv.gz" "gunzip wikititles-v2.pl-en.tsv.gz" pl wikititles-v2.pl-en.tsv 1 2 &
download_and_process_tilde "https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2019.en-pl.tmx.zip" "unzip rapid2019.en-pl.tmx.zip" bitext pl "prepare_tmx RAPID_2019.UNIQUE.en-pl.tmx" &
# download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-pl.langid.tsv.gz" "gunzip WikiMatrix.v1.en-pl.langid.tsv.gz" pl WikiMatrix.v1.en-pl.langid.tsv 3 2 &
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.pl" | sort -V | xargs cat > all.pl
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter pl all.pl $DEST/train.pl_PL-en_XX.pl_PL en all.en $DEST/train.pl_PL-en_XX.en_XX
}
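# prepare_uncorpus URLS FILES: download the multi-part UN corpus archives listed in URLS, concatenate the
# parts named in FILES into one tarball, extract it, and expose uncorpus.$LANG / uncorpus.en symlinks
# (note: the symlink names use $LANG from the caller's environment).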
prepare_uncorpus() {
URLS=$1
FILES=$2
mkdir -p uncorpus
cd uncorpus
for URL in $URLS; do
wget -nc $URL
done
cat $FILES > uncorpus.tar.gz
tar -zxvf uncorpus.tar.gz
cd ..
ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.$LANG uncorpus.$LANG
ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.en uncorpus.en
}
prepare_yandex() {
mkdir -p yandex
cd yandex
unzip $YANDEX_CORPUS -d ./
cd ..
ln -s yandex/corpus.en_ru.1m.en yandex.en
ln -s yandex/corpus.en_ru.1m.ru yandex.ru
}
prepare_ru() {
OUTPUT_DIR=$TMP_DIR/ru
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" "tar -zxvf paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" ru paracrawl-release1.en-ru.zipporah0-dedup-clean &
link_commoncrawl ru
download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ru.tsv.gz" "gunzip news-commentary-v15.en-ru.tsv.gz" ru news-commentary-v15.en-ru.tsv 2 1 &
prepare_yandex &
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ru-en.tsv.gz" "gunzip wikititles-v2.ru-en.tsv.gz" ru wikititles-v2.ru-en.tsv 1 2 &
prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" "UNv1.0.en-ru.tar.gz.00 UNv1.0.en-ru.tar.gz.01 UNv1.0.en-ru.tar.gz.02" &
download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ru.langid.tsv.gz" "gunzip WikiMatrix.v1.en-ru.langid.tsv.gz" ru WikiMatrix.v1.en-ru.langid.tsv 3 2 &
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.ru" | sort -V | xargs cat > all.ru
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter ru all.ru $DEST/train.ru_RU-en_XX.ru_RU en all.en $DEST/train.ru_RU-en_XX.en_XX
}
prepare_ccmt() {
mkdir -p ccmt
cd ccmt
# assume ccmt data is already unzipped under CCMT_DIR folder
cat $CCMT_DIR/datum2017/Book*_cn.txt | sed 's/ //g' > datum2017.detok.zh
cat $CCMT_DIR/datum2017/Book*_en.txt > datum2017.detok.en
cat $CCMT_DIR/casict2011/casict-A_ch.txt $CCMT_DIR/casict2011/casict-B_ch.txt $CCMT_DIR/casict2015/casict2015_ch.txt $CCMT_DIR/datum2015/datum_ch.txt $CCMT_DIR/neu2017/NEU_cn.txt datum2017.detok.zh > ccmt.zh
cat $CCMT_DIR/casict2011/casict-A_en.txt $CCMT_DIR/casict2011/casict-B_en.txt $CCMT_DIR/casict2015/casict2015_en.txt $CCMT_DIR/datum2015/datum_en.txt $CCMT_DIR/neu2017/NEU_en.txt datum2017.detok.en > ccmt.en
cd ..
ln -sf ccmt/ccmt.zh ccmt.zh
ln -sf ccmt/ccmt.en ccmt.en
}
prepare_zh() {
OUTPUT_DIR=$TMP_DIR/zh
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-zh.tsv.gz" "gunzip news-commentary-v15.en-zh.tsv.gz" zh news-commentary-v15.en-zh.tsv 2 1 &
download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.zh-en.tsv.gz" "gunzip wikititles-v2.zh-en.tsv.gz" zh wikititles-v2.zh-en.tsv 1 2 &
prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" "UNv1.0.en-zh.tar.gz.00 UNv1.0.en-zh.tar.gz.01" &
prepare_ccmt &
download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-zh.langid.tsv.gz" "gunzip WikiMatrix.v1.en-zh.langid.tsv.gz" zh WikiMatrix.v1.en-zh.langid.tsv 3 2 &
wait
# remove previous results
rm -f all.??
find ./ -maxdepth 1 -name "*.zh" | sort -V | xargs cat > all.zh
find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
lid_filter zh all.zh $DEST/train.zh_CN-en_XX.zh_CN en all.en $DEST/train.zh_CN-en_XX.en_XX
}
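# prepare_tests: build the valid/test sets from the WMT20 dev package. The newsdev2020 sets (ja, iu, ta,
# pl) are split round-robin into two halves (first half -> valid, second half -> test), km/ps reuse the
# provided wikipedia dev/devtest files, and cs/de/ru/zh reuse newstest2018 as valid and newstest2019 as test.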
prepare_tests() {
OUTPUT_DIR=$TMP_DIR
mkdir -p $OUTPUT_DIR
cd $OUTPUT_DIR
wget -nc http://data.statmt.org/wmt20/translation-task/dev.tgz
tar -zxvf dev.tgz
cd dev
cat newsdev2020-jaen-src.ja.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.ja
cat newsdev2020-jaen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.en
split newsdev2020-jaen.ja -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.ja_XX
split newsdev2020-jaen.en -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.en_XX
split newsdev2020-jaen.ja -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.ja_XX
split newsdev2020-jaen.en -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.en_XX
cat newsdev2020-iuen-src.iu.sgm | strip_sgm.sh > newsdev2020-iuen.iu
cat newsdev2020-iuen-ref.en.sgm | strip_sgm.sh > newsdev2020-iuen.en
split newsdev2020-iuen.iu -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.iu_CA
split newsdev2020-iuen.en -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.en_XX
split newsdev2020-iuen.iu -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.iu_CA
split newsdev2020-iuen.en -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.en_XX
cat newsdev2020-taen-src.ta.sgm | strip_sgm.sh > newsdev2020-taen.ta
cat newsdev2020-taen-ref.en.sgm | strip_sgm.sh > newsdev2020-taen.en
split newsdev2020-taen.ta -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.ta_IN
split newsdev2020-taen.en -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.en_XX
split newsdev2020-taen.ta -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.ta_IN
split newsdev2020-taen.en -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.en_XX
cp wikipedia.dev.km-en.km $DEST/valid.km_KH-en_XX.km_KH
cp wikipedia.dev.km-en.en $DEST/valid.km_KH-en_XX.en_XX
cp wikipedia.devtest.km-en.km $DEST/test.km_KH-en_XX.km_KH
cp wikipedia.devtest.km-en.en $DEST/test.km_KH-en_XX.en_XX
cp wikipedia.dev.ps-en.ps $DEST/valid.ps_AF-en_XX.ps_AF
cp wikipedia.dev.ps-en.en $DEST/valid.ps_AF-en_XX.en_XX
cp wikipedia.devtest.ps-en.ps $DEST/test.ps_AF-en_XX.ps_AF
cp wikipedia.devtest.ps-en.en $DEST/test.ps_AF-en_XX.en_XX
cat newsdev2020-plen-src.pl.sgm | strip_sgm.sh > newsdev2020-plen.pl
cat newsdev2020-plen-ref.en.sgm | strip_sgm.sh > newsdev2020-plen.en
split newsdev2020-plen.pl -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.pl_PL
split newsdev2020-plen.en -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.en_XX
split newsdev2020-plen.pl -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.pl_PL
split newsdev2020-plen.en -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.en_XX
cat newstest2018-encs-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.en_XX
cat newstest2018-encs-ref.cs.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.cs_CZ
cat newstest2019-encs-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.en_XX
cat newstest2019-encs-ref.cs.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.cs_CZ
cat newstest2018-deen-src.de.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.de_DE
cat newstest2018-deen-ref.en.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.en_XX
cat newstest2018-ende-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.en_XX
cat newstest2018-ende-ref.de.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.de_DE
cat newstest2019-deen-src.de.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.de_DE
cat newstest2019-deen-ref.en.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.en_XX
cat newstest2019-ende-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.en_XX
cat newstest2019-ende-ref.de.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.de_DE
cat newstest2018-ruen-src.ru.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.ru_RU
cat newstest2018-ruen-ref.en.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.en_XX
cat newstest2018-enru-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.en_XX
cat newstest2018-enru-ref.ru.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.ru_RU
cat newstest2019-ruen-src.ru.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.ru_RU
cat newstest2019-ruen-ref.en.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.en_XX
cat newstest2019-enru-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.en_XX
cat newstest2019-enru-ref.ru.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.ru_RU
cat newstest2018-zhen-src.zh.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.zh_CN
cat newstest2018-zhen-ref.en.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.en_XX
cat newstest2018-enzh-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.en_XX
cat newstest2018-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.zh_CN
cat newstest2019-zhen-src.zh.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.zh_CN
cat newstest2019-zhen-ref.en.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.en_XX
cat newstest2019-enzh-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.en_XX
cat newstest2019-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.zh_CN
}
mkdir -p $DEST
prepare_lid
prepare_moses
download_commoncrawl
prepare_ja &
prepare_ta &
prepare_km &
prepare_ps &
prepare_iu &
prepare_cs &
prepare_de &
prepare_pl &
prepare_ru &
prepare_zh &
# prepare valid/test set
prepare_tests &
# wait
# TODO remove intermediate files
# rm -rf $TMP_DIR
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
if [ -z $WORKDIR_ROOT ] ;
then
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
exit
fi
if [ -z $SPM_PATH ] ;
then
echo "Please install sentence piecence from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. Exitting..."
exit
fi
ML50=${WORKDIR_ROOT}/ML50
mkdir -p $ML50/dedup
mkdir -p $ML50/clean
python ./dedup_all.py --from-folder $ML50/raw --to-folder $ML50/dedup
python ./remove_valid_test_in_train.py --from-folder $ML50/dedup --to-folder $ML50/clean
python ./binarize.py --raw-folder $ML50/clean
\ No newline at end of file