# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import glob
import tarfile
import time
import zipfile
import functools
import requests
import shutil

# Timestamp of the most recent progress-bar flush; mutated by
# uncompressor.progress() via `global lasttime` to throttle redraws.
lasttime = time.time()
# Minimum interval in seconds between consecutive progress-bar redraws.
FLUSH_INTERVAL = 0.1


class uncompressor:
    """Download remote files and extract local zip/tar archives, with an
    optional in-place console progress bar."""

    def __init__(self, download_params):
        """Optionally download a batch of files at construction time.

        Args:
            download_params: ``None`` for a no-op, or a 3-tuple
                ``(urls, savepath, print_progress)`` where ``urls`` maps a
                target file name to its URL (falsy URLs are skipped),
                ``savepath`` is the destination directory, and
                ``print_progress`` toggles console output.
        """
        if download_params is not None:
            urls, savepath, print_progress = download_params
            for key, url in urls.items():
                if url:
                    self._download_file(
                        url,
                        savepath=os.path.join(savepath, key),
                        print_progress=print_progress)

    def _uncompress_file_zip(self, filepath, extrapath):
        """Extract a zip archive member by member.

        Yields ``(total_num, index, rootpath)`` after each extracted member,
        where ``rootpath`` is the archive's first entry name. The final state
        is yielded once more after the archive is closed so the caller can
        render a completed progress bar.
        """
        # NOTE(review): extract() trusts member names; a crafted archive can
        # write outside `extrapath` — only use with trusted sources.
        with zipfile.ZipFile(filepath, 'r') as files:
            filelist = files.namelist()
            rootpath = filelist[0]
            total_num = len(filelist)
            for index, file in enumerate(filelist):
                files.extract(file, extrapath)
                yield total_num, index, rootpath
        yield total_num, index, rootpath

    def progress(self, msg, end=False):
        """Write `msg` as a carriage-return progress line on stdout.

        Redraws are throttled to at most one per FLUSH_INTERVAL seconds;
        ``end=True`` appends a newline and forces the final flush.
        """
        global lasttime
        if end:
            msg += "\n"
            lasttime = 0  # guarantees the write below happens
        if time.time() - lasttime >= FLUSH_INTERVAL:
            sys.stdout.write("\r%s" % msg)
            lasttime = time.time()
            sys.stdout.flush()

    def _uncompress_file_tar(self, filepath, extrapath, mode="r:gz"):
        """Extract a tar archive member by member.

        Same yield protocol as :meth:`_uncompress_file_zip`: one
        ``(total_num, index, rootpath)`` tuple per member, plus a final
        repeat after the archive is closed.
        """
        # NOTE(review): tarfile.extract() is vulnerable to path traversal
        # with untrusted archives (see CVE-2007-4559 / the `filter=` arg
        # added in Python 3.12) — only use with trusted sources.
        with tarfile.open(filepath, mode) as files:
            filelist = files.getnames()
            total_num = len(filelist)
            rootpath = filelist[0]
            for index, file in enumerate(filelist):
                files.extract(file, extrapath)
                yield total_num, index, rootpath
        yield total_num, index, rootpath

    def _uncompress_file(self, filepath, extrapath, delete_file,
                         print_progress):
        """Extract `filepath` into `extrapath`, choosing the handler by
        file-name suffix.

        Args:
            filepath: Path of the archive (zip / tgz / tar / tar.gz, else
                treated as an uncompressed tar stream).
            extrapath: Destination directory.
            delete_file: Remove the archive after successful extraction.
            print_progress: Draw a progress bar on stdout.

        Returns:
            The archive's root entry name (first member in its listing).
        """
        if print_progress:
            print("Uncompress %s" % os.path.basename(filepath))

        if filepath.endswith("zip"):
            handler = self._uncompress_file_zip
        elif filepath.endswith(("tgz", "tar", "tar.gz")):
            # "r:*" lets tarfile auto-detect the compression scheme.
            handler = functools.partial(self._uncompress_file_tar, mode="r:*")
        else:
            handler = functools.partial(self._uncompress_file_tar, mode="r")

        for total_num, index, rootpath in handler(filepath, extrapath):
            if print_progress:
                done = int(50 * float(index) / total_num)
                self.progress("[%-50s] %.2f%%" %
                              ('=' * done, float(100 * index) / total_num))
        if print_progress:
            self.progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True)

        if delete_file:
            os.remove(filepath)

        return rootpath

    def _download_file(self, url, savepath, print_progress):
        """Stream `url` to `savepath`, skipping the download when a file of
        the advertised size already exists.

        Falls back to a plain raw-stream copy when the server sends no
        Content-Length header (no progress bar possible in that case).
        """
        if print_progress:
            print("Connecting to {}".format(url))
        # Context manager guarantees the connection is released on every
        # path, including the early "already downloaded" return.
        with requests.get(url, stream=True, timeout=15) as r:
            total_length = r.headers.get('content-length')

            if total_length is None:
                with open(savepath, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            else:
                total_length = int(total_length)
                if os.path.exists(savepath) and total_length == os.path.getsize(
                        savepath):
                    print("{} already downloaded, skipping".format(
                        os.path.basename(savepath)))
                    return
                with open(savepath, 'wb') as f:
                    dl = 0
                    starttime = time.time()
                    if print_progress:
                        print("Downloading %s" % os.path.basename(savepath))
                    for data in r.iter_content(chunk_size=4096):
                        dl += len(data)
                        f.write(data)
                        if print_progress:
                            done = int(50 * dl / total_length)
                            self.progress(
                                "[%-50s] %.2f%%" %
                                ('=' * done, float(100 * dl) / total_length))
                if print_progress:
                    self.progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True)