Unverified commit 4560556d, authored by Philip Meier and committed by GitHub
Browse files

Fix redirect behavior of datasets.utils.download_url (#3564)

* use head request for redirects

* remove requests dependency
parent c808d163
...@@ -61,18 +61,20 @@ def check_integrity(fpath: str, md5: Optional[str] = None) -> bool: ...@@ -61,18 +61,20 @@ def check_integrity(fpath: str, md5: Optional[str] = None) -> bool:
return check_md5(fpath, md5) return check_md5(fpath, md5)
def _get_redirect_url(url: str, max_hops: int = 10) -> str: def _get_redirect_url(url: str, max_hops: int = 3) -> str:
import requests initial_url = url
headers = {"Method": "HEAD", "User-Agent": USER_AGENT}
for hop in range(max_hops + 1):
response = requests.get(url)
for _ in range(max_hops + 1):
with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as response:
if response.url == url or response.url is None: if response.url == url or response.url is None:
return url return url
url = response.url url = response.url
else: else:
raise RecursionError(f"Too many redirects: {max_hops + 1})") raise RecursionError(
f"Request to {initial_url} exceeded {max_hops} redirects. The last redirect points to {url}."
)
def _get_google_drive_file_id(url: str) -> Optional[str]: def _get_google_drive_file_id(url: str) -> Optional[str]:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment