import io
import json
import os
from pathlib import Path
from typing import Optional, Union

import requests
from tqdm import tqdm

from ...utils.helpers import make_relative

PathLike = Union[os.PathLike, str]

__doc__ = """
Helper functions for downloading data and keeping track of which URLs have already been downloaded.
"""


def check_url(url: str) -> bool:
    """Return True if `url` answers an HTTP HEAD request with a non-error status."""
    return requests.head(url).ok


def download_stream(
    url: str,
    file_object,
    block_size: int = 1024,
    silent: bool = False,
    label: Optional[str] = None,
):
    """Stream the contents of `url` into a writable binary `file_object`.

    Shows a tqdm progress bar unless `silent` is True; `label` is used as the bar description.
    """
    with requests.get(url, stream=True) as resp:
        # Fail early on HTTP errors instead of silently writing an error page into the output.
        resp.raise_for_status()
        total_size = int(resp.headers.get("content-length", 0))
        if not silent:
            progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True, desc=label)

        for chunk in resp.iter_content(block_size):
            if not silent:
                progress_bar.update(len(chunk))
            file_object.write(chunk)

        if not silent:
            progress_bar.close()
            if total_size != 0 and progress_bar.n != total_size:
                print("ERROR: downloaded size does not match the reported content-length")


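# Usage sketch (not part of the original module; URL and filenames are placeholders):
# download_stream accepts any writable binary file object, so the payload can be
# re-compressed on the fly, e.g. through a gzip file wrapper:
#
#     import gzip
#     with gzip.open("payload.bin.gz", "wb") as fh:
#         download_stream("https://example.com/payload.bin", fh, label="payload.bin")

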
def download_data(
    url: str,
    block_size: int = 1024,
    silent: bool = False,
    label: Optional[str] = None,
) -> bytearray:
    """Download `url` into memory and return its contents as a bytearray."""
    f = io.BytesIO()
    download_stream(url, f, block_size=block_size, silent=silent, label=label)
    return bytearray(f.getvalue())


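# Usage sketch (not part of the original module; the URL is a placeholder):
# download_data keeps the payload in memory, which is convenient for small
# JSON or CSV resources that are parsed immediately:
#
#     payload = download_data("https://example.com/records.json", label="records.json")
#     records = json.loads(payload.decode("utf-8"))

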
def download_file(
    url: str,
    fname: Union[Path, str],
    block_size: int = 1024,
    silent: bool = False,
):
    """Download `url` to the path `fname`, labelling the progress bar with the file name."""
    fname = Path(fname)
    with fname.open("wb") as f:
        download_stream(
            url,
            f,
            block_size=block_size,
            silent=silent,
            label=make_relative(fname, Path.cwd()).name,
        )


def is_downloaded(
    target_dir: PathLike,
    url: str,
    *,
    add: bool = False,
    dbfiles: Union[list[PathLike], PathLike],
) -> bool:
    """Check whether `url` is recorded as downloaded in the JSON database files under `target_dir`.

    Each database file is a JSON object with a "downloaded" list of URLs. If `add`
    is True and the URL is not yet recorded, it is appended to the first database file.
    """
    target_dir = Path(target_dir)
    if not isinstance(dbfiles, list):
        dbfiles = [dbfiles]
    if not dbfiles:
        raise ValueError("'dbfiles' empty")

    downloaded = set()
    for dbfile_fname in dbfiles:
        dbfile_fname = target_dir / dbfile_fname
        if dbfile_fname.is_file():
            with open(dbfile_fname, "r") as f:
                downloaded.update(json.load(f)["downloaded"])

    if add and url not in downloaded:
        downloaded.add(url)
        # Write back to the first database file, resolved against target_dir to match the read path above.
        with open(target_dir / dbfiles[0], "w") as f:
            data = {"downloaded": sorted(downloaded)}
            json.dump(data, f, indent=2, sort_keys=True)
        return True

    return url in downloaded

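
# Combined usage sketch (not part of the original module; the directory, database
# file name, and URL are placeholders): skip URLs that were already fetched, and
# record a successful download in the database file via `add=True`:
#
#     data_dir = Path("data")
#     data_dir.mkdir(exist_ok=True)
#     url = "https://example.com/dataset.csv"
#     if check_url(url) and not is_downloaded(data_dir, url, dbfiles="downloads.json"):
#         download_file(url, data_dir / "dataset.csv")
#         is_downloaded(data_dir, url, add=True, dbfiles="downloads.json")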