Commit 43489756 authored by thomwolf's avatar thomwolf
Browse files

adding proxies options for the from_pretrained methods

parent 6d0aa739
...@@ -128,3 +128,5 @@ proc_data ...@@ -128,3 +128,5 @@ proc_data
# examples # examples
runs runs
examples/runs examples/runs
data
\ No newline at end of file
...@@ -17,8 +17,9 @@ from hashlib import sha256 ...@@ -17,8 +17,9 @@ from hashlib import sha256
from io import open from io import open
import boto3 import boto3
import requests from botocore.config import Config
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
import requests
from tqdm import tqdm from tqdm import tqdm
try: try:
...@@ -93,7 +94,7 @@ def filename_to_url(filename, cache_dir=None): ...@@ -93,7 +94,7 @@ def filename_to_url(filename, cache_dir=None):
return url, etag return url, etag
def cached_path(url_or_filename, cache_dir=None, force_download=False): def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
""" """
Given something that might be a URL (or might be a local path), Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and determine which. If it's a URL, download the file and cache it, and
...@@ -114,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False): ...@@ -114,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False):
if parsed.scheme in ('http', 'https', 's3'): if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary) # URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download) return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
elif os.path.exists(url_or_filename): elif os.path.exists(url_or_filename):
# File, and it exists. # File, and it exists.
return url_or_filename return url_or_filename
...@@ -159,24 +160,24 @@ def s3_request(func): ...@@ -159,24 +160,24 @@ def s3_request(func):
@s3_request
def s3_etag(url, proxies=None):
    """Return the ETag of the S3 object at *url*.

    Args:
        url: an ``s3://bucket/key`` URL identifying the object.
        proxies: optional dict of proxy servers, passed through to
            botocore's ``Config`` so the HEAD request is routed via proxy.

    Returns:
        The object's ETag string as reported by S3.
    """
    resource = boto3.resource("s3", config=Config(proxies=proxies))
    bucket_name, key = split_s3_path(url)
    obj = resource.Object(bucket_name, key)
    return obj.e_tag
@s3_request
def s3_get(url, temp_file, proxies=None):
    """Download the S3 object at *url* directly into *temp_file*.

    Args:
        url: an ``s3://bucket/key`` URL identifying the object.
        temp_file: a writable binary file-like object receiving the data.
        proxies: optional dict of proxy servers, passed through to
            botocore's ``Config`` so the download is routed via proxy.
    """
    resource = boto3.resource("s3", config=Config(proxies=proxies))
    bucket_name, key = split_s3_path(url)
    resource.Bucket(bucket_name).download_fileobj(key, temp_file)
def http_get(url, temp_file): def http_get(url, temp_file, proxies=None):
req = requests.get(url, stream=True) req = requests.get(url, stream=True, proxies=proxies)
content_length = req.headers.get('Content-Length') content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total) progress = tqdm(unit="B", total=total)
...@@ -187,7 +188,7 @@ def http_get(url, temp_file): ...@@ -187,7 +188,7 @@ def http_get(url, temp_file):
progress.close() progress.close()
def get_from_cache(url, cache_dir=None, force_download=False): def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
""" """
Given a URL, look for the corresponding dataset in the local cache. Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file. If it's not there, download it. Then return the path to the cached file.
...@@ -204,10 +205,10 @@ def get_from_cache(url, cache_dir=None, force_download=False): ...@@ -204,10 +205,10 @@ def get_from_cache(url, cache_dir=None, force_download=False):
# Get eTag to add to filename, if it exists. # Get eTag to add to filename, if it exists.
if url.startswith("s3://"): if url.startswith("s3://"):
etag = s3_etag(url) etag = s3_etag(url, proxies=proxies)
else: else:
try: try:
response = requests.head(url, allow_redirects=True) response = requests.head(url, allow_redirects=True, proxies=proxies)
if response.status_code != 200: if response.status_code != 200:
etag = None etag = None
else: else:
...@@ -238,9 +239,9 @@ def get_from_cache(url, cache_dir=None, force_download=False): ...@@ -238,9 +239,9 @@ def get_from_cache(url, cache_dir=None, force_download=False):
# GET file object # GET file object
if url.startswith("s3://"): if url.startswith("s3://"):
s3_get(url, temp_file) s3_get(url, temp_file, proxies=proxies)
else: else:
http_get(url, temp_file) http_get(url, temp_file, proxies=proxies)
# we are copying the file before closing it, so flush to avoid truncation # we are copying the file before closing it, so flush to avoid truncation
temp_file.flush() temp_file.flush()
......
...@@ -128,6 +128,10 @@ class PretrainedConfig(object): ...@@ -128,6 +128,10 @@ class PretrainedConfig(object):
force_download: (`optional`) boolean, default False: force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exists. Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
return_unused_kwargs: (`optional`) bool: return_unused_kwargs: (`optional`) bool:
- If False, then this function returns just the final configuration object. - If False, then this function returns just the final configuration object.
...@@ -150,6 +154,7 @@ class PretrainedConfig(object): ...@@ -150,6 +154,7 @@ class PretrainedConfig(object):
""" """
cache_dir = kwargs.pop('cache_dir', None) cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False) force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
if pretrained_model_name_or_path in cls.pretrained_config_archive_map: if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
...@@ -160,7 +165,7 @@ class PretrainedConfig(object): ...@@ -160,7 +165,7 @@ class PretrainedConfig(object):
config_file = pretrained_model_name_or_path config_file = pretrained_model_name_or_path
# redirect to the cache, if necessary # redirect to the cache, if necessary
try: try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download) resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError: except EnvironmentError:
if pretrained_model_name_or_path in cls.pretrained_config_archive_map: if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
logger.error( logger.error(
...@@ -407,6 +412,10 @@ class PreTrainedModel(nn.Module): ...@@ -407,6 +412,10 @@ class PreTrainedModel(nn.Module):
force_download: (`optional`) boolean, default False: force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exists. Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean: output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
...@@ -432,6 +441,7 @@ class PreTrainedModel(nn.Module): ...@@ -432,6 +441,7 @@ class PreTrainedModel(nn.Module):
cache_dir = kwargs.pop('cache_dir', None) cache_dir = kwargs.pop('cache_dir', None)
from_tf = kwargs.pop('from_tf', False) from_tf = kwargs.pop('from_tf', False)
force_download = kwargs.pop('force_download', False) force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
output_loading_info = kwargs.pop('output_loading_info', False) output_loading_info = kwargs.pop('output_loading_info', False)
# Load config # Load config
...@@ -462,7 +472,7 @@ class PreTrainedModel(nn.Module): ...@@ -462,7 +472,7 @@ class PreTrainedModel(nn.Module):
archive_file = pretrained_model_name_or_path archive_file = pretrained_model_name_or_path
# redirect to the cache, if necessary # redirect to the cache, if necessary
try: try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download) resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError: except EnvironmentError:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map: if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error( logger.error(
......
...@@ -196,6 +196,10 @@ class PreTrainedTokenizer(object): ...@@ -196,6 +196,10 @@ class PreTrainedTokenizer(object):
force_download: (`optional`) boolean, default False: force_download: (`optional`) boolean, default False:
Force to (re-)download the vocabulary files and override the cached versions if they exists. Force to (re-)download the vocabulary files and override the cached versions if they exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
...@@ -227,6 +231,7 @@ class PreTrainedTokenizer(object): ...@@ -227,6 +231,7 @@ class PreTrainedTokenizer(object):
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
cache_dir = kwargs.pop('cache_dir', None) cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False) force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
s3_models = list(cls.max_model_input_sizes.keys()) s3_models = list(cls.max_model_input_sizes.keys())
vocab_files = {} vocab_files = {}
...@@ -287,7 +292,7 @@ class PreTrainedTokenizer(object): ...@@ -287,7 +292,7 @@ class PreTrainedTokenizer(object):
if file_path is None: if file_path is None:
resolved_vocab_files[file_id] = None resolved_vocab_files[file_id] = None
else: else:
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download) resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError: except EnvironmentError:
if pretrained_model_name_or_path in s3_models: if pretrained_model_name_or_path in s3_models:
logger.error("Couldn't reach server to download vocabulary.") logger.error("Couldn't reach server to download vocabulary.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment