diff --git a/hosts/tsuki/configuration.nix b/hosts/tsuki/configuration.nix
index 48c775b..c1fa5af 100644
--- a/hosts/tsuki/configuration.nix
+++ b/hosts/tsuki/configuration.nix
@@ -26,6 +26,8 @@
     ./services/postgres.nix
     ./services/vaultwarden.nix
     ./services/vscode-server.nix
+
+    ./services/scrapers/nhk-easy-news/default.nix
   ];
 
   machineVars = {
diff --git a/hosts/tsuki/services/scrapers/nhk-easy-news/default.nix b/hosts/tsuki/services/scrapers/nhk-easy-news/default.nix
new file mode 100644
index 0000000..4e394f7
--- /dev/null
+++ b/hosts/tsuki/services/scrapers/nhk-easy-news/default.nix
@@ -0,0 +1,38 @@
+{ config, pkgs, lib, ... }: let
+  # Build script.py into an executable with its Python dependencies.
+  # NOTE(review): lib.fileContents drops the file's trailing newline, which
+  # is presumably why script.py ends with an extra blank line - confirm.
+  script = pkgs.writers.writePython3 "scrape-nhk-easy-news.py" {
+    libraries = with pkgs.python3Packages; [ requests wget ];
+  } (lib.fileContents ./script.py);
+in {
+  systemd.services.scrape-nhk-easy-news = {
+    # The scraper needs working connectivity; network.target alone does not
+    # wait for the network to be configured.
+    after = [ "network-online.target" ];
+    wants = [ "network-online.target" ];
+    serviceConfig = {
+      Type = "oneshot";
+      ExecStart = script;
+      DynamicUser = true;
+      PrivateTmp = true;
+      ProtectHome = true;
+      ProtectProc = "invisible";
+      ProtectSystem = "strict";
+      # The script writes below its working directory, so run it inside the
+      # state directory configured by StateDirectory.
+      WorkingDirectory = "/var/lib/nhk_easy_news_scraper";
+      StateDirectory = "nhk_easy_news_scraper";
+      StateDirectoryMode = "0755";
+    };
+  };
+
+  # Trigger the scrape once a day at 03:00.
+  systemd.timers.scrape-nhk-easy-news = {
+    wantedBy = [ "timers.target" ];
+    timerConfig = {
+      Unit = "scrape-nhk-easy-news.service";
+      OnCalendar = "*-*-* 03:00:00";
+    };
+  };
+}
diff --git a/hosts/tsuki/services/scrapers/nhk-easy-news/script.py b/hosts/tsuki/services/scrapers/nhk-easy-news/script.py
new file mode 100644
index 0000000..ce7b2fc
--- /dev/null
+++ b/hosts/tsuki/services/scrapers/nhk-easy-news/script.py
@@ -0,0 +1,62 @@
+"""Download NHK Easy News articles that have not been saved yet.
+
+Fetches the article index published by NHK and stores each listed
+article as ``articles/nhkeasy_<news_id>.html`` below the current working
+directory.  Articles whose file already exists are skipped, so the script
+can be re-run without downloading anything twice.
+"""
+from pathlib import Path
+
+import requests
+import wget
+
+NEWS_LIST_URL = 'http://www3.nhk.or.jp/news/easy/news-list.json'
+# Seconds to wait on the index request; without a timeout requests.get
+# can block forever on an unresponsive server.
+REQUEST_TIMEOUT = 30
+
+
+def main():
+    """Fetch the article index and download every article not on disk.
+
+    Raises:
+        requests.RequestException: if the index cannot be fetched or the
+            server answers with an HTTP error status.
+    """
+    response = requests.get(NEWS_LIST_URL, timeout=REQUEST_TIMEOUT)
+    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
+    response.raise_for_status()
+    nhkjson = response.json()
+
+    articles_dir = Path(".").resolve() / 'articles'
+    articles_dir.mkdir(exist_ok=True)
+
+    # Only the first element of the index is used; its values are
+    # iterated as collections of article records.
+    for articles in nhkjson[0].values():
+        for article in articles:
+            news_id = article['news_id']
+            path = articles_dir / f'nhkeasy_{news_id}.html'
+
+            if path.exists():
+                # Already downloaded on an earlier run.
+                continue
+
+            print(f"New article with ID: {news_id}")
+            try:
+                # bar=None disables wget's console progress bar.
+                wget.download(
+                    article['news_web_url'], out=str(path), bar=None)
+            except Exception as err:
+                # Best effort: drop any partial file, report the problem
+                # and carry on with the remaining articles.
+                path.unlink(missing_ok=True)
+                print(f"Failed to download article ID: {news_id}")
+                print(err)
+            else:
+                print(f"Successful download of article ID: {news_id}")
+
+
+if __name__ == '__main__':
+    main()
+