Fix language inference in generate.py

8943fc78 · Myle Ott · 84b82dc6 · 8943fc78
Commit 8943fc78 authored Oct 17, 2017 by Myle Ott
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 16 deletions

fairseq/data.py fairseq/data.py +18 -16

No files found.
--- a/fairseq/data.py
+++ b/fairseq/data.py
@@ -18,30 +18,32 @@ from fairseq.indexed_dataset import IndexedDataset, IndexedInMemoryDataset
 def load_with_check(path, load_splits, src=None, dst=None):
-    """Loads the train, valid, and test sets from the specified folder
+    """Loads specified data splits (e.g., test, train or valid) from the
-    and check that training files exist."""
+    specified folder and check that files exist."""
    def find_language_pair(files):
-        for filename in files:
+        for split in load_splits:
-            parts = filename.split('.')
+            for filename in files:
-            if parts[0] == 'train' and parts[-1] == 'idx':
+                parts = filename.split('.')
-                return parts[1].split('-')
+                if parts[0] == split and parts[-1] == 'idx':
+                    return parts[1].split('-')
-    def train_file_exists(src, dst):
-        filename = 'train.{0}-{1}.{0}.idx'.format(src, dst)
+    def split_exists(split, src, dst):
+        filename = '{0}.{1}-{2}.{1}.idx'.format(split, src, dst)
        return os.path.exists(os.path.join(path, filename))
    if src is None and dst is None:
        # find language pair automatically
        src, dst = find_language_pair(os.listdir(path))
-    elif train_file_exists(src, dst):
-        # check for src-dst langcode
+    if not split_exists(load_splits[0], src, dst):
-        pass
+        # try reversing src and dst
-    elif train_file_exists(dst, src):
-        # check for dst-src langcode
        src, dst = dst, src
-    else:
-        raise ValueError('training file not found for {}-{}'.format(src, dst))
+    for split in load_splits:
+        if not split_exists(split, src, dst):  # validate every requested split, not just the first
+            raise ValueError('Data split not found: {}-{} ({})'.format(
+                src, dst, split))
    dataset = load(path, load_splits, src, dst)
    return dataset