Files
marf/ifield/data/common/download.py
2025-01-09 15:43:11 +01:00

91 lines
2.7 KiB
Python

from ...utils.helpers import make_relative
from pathlib import Path
from tqdm import tqdm
from typing import Union, Optional
import io
import os
import json
import requests
# Alias for arguments that accept a filesystem path: either an os.PathLike
# object or a plain str.
PathLike = Union[os.PathLike, str]
# Assigned explicitly: the string below comes after the imports, so it is not
# the first statement of the module and would not become the module docstring
# on its own.
__doc__ = """
Here are some helper functions for processing data.
"""
def check_url(url):  # HTTP HEAD
    """Issue an HTTP HEAD request to ``url`` and report the response's ``ok`` flag.

    Args:
        url: Address to probe.

    Returns:
        The value of ``Response.ok`` for the HEAD response.

    Any exception raised by ``requests.head`` propagates to the caller.
    """
    head_response = requests.head(url)
    return head_response.ok
def download_stream(
    url: str,
    file_object,
    block_size: int = 1024,
    silent: bool = False,
    label: Optional[str] = None,
):
    """Stream the body of ``url`` into ``file_object`` chunk by chunk.

    Args:
        url: Address to fetch with an HTTP GET request.
        file_object: Any object with a ``write(bytes)`` method; it is neither
            opened nor closed here.
        block_size: Chunk size passed to ``Response.iter_content``.
        silent: When true, no progress bar is shown and no size-mismatch
            message is printed.
        label: Description shown next to the progress bar.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never written out as if it were the payload.
    """
    # The context manager releases the connection even if writing fails midway.
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        total_size = int(resp.headers.get("content-length", 0))

        progress_bar = None
        if not silent:
            progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True, desc=label)

        # Counted independently of the progress bar so the size check below
        # never depends on an object that only exists when not silent.
        received = 0
        try:
            for chunk in resp.iter_content(block_size):
                received += len(chunk)
                if progress_bar is not None:
                    progress_bar.update(len(chunk))
                file_object.write(chunk)
        finally:
            if progress_bar is not None:
                progress_bar.close()

    # Best-effort report only, as before: a mismatch is printed, not raised.
    # NOTE(review): presumably kept non-fatal because the received byte count
    # can legitimately differ from Content-Length for compressed responses;
    # confirm before turning this into an exception.
    if not silent and total_size != 0 and received != total_size:
        print("ERROR, something went wrong")
def download_data(
    url: str,
    block_size: int = 1024,
    silent: bool = False,
    label: Optional[str] = None,
) -> bytearray:
    """Download ``url`` into memory and return the bytes as a ``bytearray``.

    Args:
        url: Address to fetch.
        block_size: Chunk size forwarded to ``download_stream``.
        silent: Forwarded to ``download_stream``.
        label: Forwarded to ``download_stream``.

    Returns:
        A new ``bytearray`` holding everything ``download_stream`` wrote.
    """
    buffer = io.BytesIO()
    download_stream(
        url,
        buffer,
        block_size=block_size,
        silent=silent,
        label=label,
    )
    # getvalue() returns the full buffer contents regardless of the current
    # stream position, so no rewind is needed.
    payload = buffer.getvalue()
    return bytearray(payload)
def download_file(
    url: str,
    fname: Union[Path, str],
    block_size: int = 1024,
    silent = False,
):
    """Download ``url`` and write it to the file ``fname``.

    Args:
        url: Address to fetch.
        fname: Destination path; opened in binary write mode, so an existing
            file is truncated.
        block_size: Chunk size forwarded to ``download_stream``.
        silent: Forwarded to ``download_stream``.
    """
    destination = fname if isinstance(fname, Path) else Path(fname)
    with destination.open("wb") as sink:
        # Progress-bar label: the ``name`` of whatever ``make_relative``
        # returns for the destination and the current working directory.
        bar_label = make_relative(destination, Path.cwd()).name
        download_stream(
            url,
            sink,
            block_size=block_size,
            silent=silent,
            label=bar_label,
        )
def is_downloaded(
    target_dir: PathLike,
    url: str,
    *,
    add: bool = False,
    dbfiles: Union[list[PathLike], PathLike],
):
    """Check, and optionally record, whether ``url`` is listed as downloaded.

    Each database file is a JSON object whose ``"downloaded"`` key holds a
    list of URLs. The entries of every existing file in ``dbfiles`` are merged
    before the lookup; files that do not exist are skipped.

    Args:
        target_dir: Directory that relative ``dbfiles`` entries are resolved
            against.
        url: The URL to look up.
        add: When true and ``url`` is not yet listed, add it and rewrite the
            first database file with the merged, sorted list.
        dbfiles: One path or a sequence of paths to database files.

    Returns:
        ``True`` if ``url`` is listed or has just been added, else ``False``.

    Raises:
        ValueError: If ``dbfiles`` is an empty sequence.
    """
    # Always coerce: a non-Path os.PathLike does not support the "/" operator.
    target_dir = Path(target_dir)

    # Accept a single path or any iterable of paths (list, tuple, ...).
    if isinstance(dbfiles, (str, os.PathLike)):
        dbfiles = [dbfiles]
    else:
        dbfiles = list(dbfiles)
    if not dbfiles:
        raise ValueError("'dbfiles' empty")

    # Resolve once so that reading and writing use the very same locations.
    db_paths = [target_dir / dbfile for dbfile in dbfiles]

    downloaded = set()
    for db_path in db_paths:
        if db_path.is_file():
            with db_path.open("r", encoding="utf-8") as f:
                downloaded.update(json.load(f)["downloaded"])

    if add and url not in downloaded:
        downloaded.add(url)
        # Write to the resolved first database path; writing to the bare
        # ``dbfiles[0]`` would land in the current working directory, where
        # the lookup above never reads it back.
        with db_paths[0].open("w", encoding="utf-8") as f:
            data = {"downloaded": sorted(downloaded)}
            json.dump(data, f, indent=2, sort_keys=True)
        return True

    return url in downloaded