chenpangpang / transformers · Commit 6b763d04
"vscode:/vscode.git/clone" did not exist on "f91ce0b8035a8cd153fcd1883efd7469e554cbd6"
Unverified commit 6b763d04, authored Jul 26, 2019 by Thomas Wolf, committed by GitHub on Jul 26, 2019

Merge pull request #911 from huggingface/small_fixes

Small fixes

Parents: c054b5ee, 7b6e474c
Changes: 2 changed files with 34 additions (+34) and 14 deletions (-14)
pytorch_transformers/__init__.py (+10 -10)
pytorch_transformers/tokenization_utils.py (+24 -4)
pytorch_transformers/__init__.py
@@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
-from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
+from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
                             BertForMaskedLM, BertForNextSentencePrediction,
                             BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
                             BertForQuestionAnswering, load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                             BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                               OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
+from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                             GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                              XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMModel,
+from .modeling_xlm import (XLMConfig, XLMPreTrainedModel, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
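The net effect of the __init__.py change is that the abstract base classes (BertPreTrainedModel, OpenAIGPTPreTrainedModel, TransfoXLPreTrainedModel, GPT2PreTrainedModel and XLMPreTrainedModel) become importable directly from the pytorch_transformers package, so custom architectures can subclass them without reaching into the modeling_* submodules. A minimal sketch of what that enables; the CustomBertClassifier class and its two-label head are illustrative, not part of this commit:

import torch.nn as nn
from pytorch_transformers import BertPreTrainedModel, BertModel

class CustomBertClassifier(BertPreTrainedModel):
    # Hypothetical custom head on top of the BERT backbone.
    def __init__(self, config):
        super(CustomBertClassifier, self).__init__(config)
        self.bert = BertModel(config)
        self.classifier = nn.Linear(config.hidden_size, 2)  # 2 labels, chosen arbitrarily
        # Weight initialization as the library's own models did at the time
        # (the init helper's name and call style may differ across versions).
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None):
        # BertModel returns (sequence_output, pooled_output, ...); use the pooled [CLS] vector.
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask)[:2]
        return self.classifier(pooled_output)

# from_pretrained() is inherited from the base class, so the custom model can load
# the pretrained backbone weights directly.
model = CustomBertClassifier.from_pretrained('bert-base-uncased')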
pytorch_transformers/tokenization_utils.py
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
             # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
             # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                     pretrained_model_name_or_path, ', '.join(s3_models),
                     pretrained_model_name_or_path))
-            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
-            all_vocab_files_names.update(cls.vocab_files_names)
-            for file_id, file_name in all_vocab_files_names.items():
+
+            # Look for the tokenizer main vocabulary files
+            for file_id, file_name in cls.vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
+                    # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
+                    # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
                     logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
+
+            # Look for the additional tokens files
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+
+            # If a path to a file was provided, get the parent directory
+            saved_directory = pretrained_model_name_or_path
+            if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
+                saved_directory = os.path.dirname(saved_directory)
+
+            for file_id, file_name in all_vocab_files_names.items():
+                full_file_name = os.path.join(saved_directory, file_name)
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+
         if all(full_file_name is None for full_file_name in vocab_files.values()):
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
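The tokenization_utils.py change splits the local-file branch of from_pretrained() into two passes: the tokenizer's main vocabulary files are resolved first (from a directory or from a single vocabulary-file path), and the optional added_tokens.json / special_tokens_map.json files are then looked up in the containing directory instead of being treated like vocabulary files. A brief usage sketch of the two loading scenarios this supports; the paths are illustrative:

from pytorch_transformers import BertTokenizer

# Case 1: a directory containing vocab.txt (and optionally added_tokens.json,
# special_tokens_map.json), e.g. one written earlier with save_pretrained().
tokenizer = BertTokenizer.from_pretrained('./my_tokenizer_dir/')

# Case 2: a path to a single vocabulary file (only works for tokenizers that use
# one vocabulary file); the additional tokens files are now searched for in the
# parent directory of that file.
tokenizer = BertTokenizer.from_pretrained('./my_tokenizer_dir/vocab.txt')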