fix python 2.7 imports

2071a9b8 · thomwolf · 8197eb9f · 2071a9b8 · 2071a9b8 · 2071a9b8
Commit 2071a9b8 authored Feb 11, 2019 by thomwolf
Showing 3 changed files with 15 additions and 2 deletions

README.md +13 -0

pytorch_pretrained_bert/file_utils.py +1 -1

tests/tokenization_openai_test.py +1 -1

No files found.
--- a/README.md
+++ b/README.md
@@ -45,6 +45,12 @@ PyTorch pretrained bert can be installed by pip as follows:
 pip install pytorch-pretrained-bert
 ```
+If you want to use the tokenizer associated with the `OpenAI GPT` model, you will need to install `ftfy` (if you are using Python 2, version 4.4.3 is the last version that works for you) and `SpaCy`:
+```bash
+pip install spacy ftfy==4.4.3
+python -m spacy download en
+```
 ### From source
 Clone the repository and run:
@@ -52,6 +58,13 @@ Clone the repository and run:
 pip install [--editable] .
 ```
+Here too, if you want to use the `OpenAIGPT` tokenizer, you will need to install `ftfy` (limited to version 4.4.3 if you are using Python 2) and `SpaCy`:
+```bash
+pip install spacy ftfy==4.4.3
+python -m spacy download en
+```
 A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 You can run the tests with the command:

--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -29,7 +29,7 @@ try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                                   Path.home() / '.pytorch_pretrained_bert'))
-except ImportError:
+except AttributeError:
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))

--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -32,7 +32,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w", encoding='utf-8') as fp:
+        with open("/tmp/openai_tokenizer_vocab_test.json", "wb") as fp:
            json.dump(vocab_tokens, fp)
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w", encoding='utf-8') as fp: