tsuki: add timed nhk easy news scraper

2023-07-12 01:57:20 +02:00 · 2023-07-12 01:57:20 +02:00 · 82ea6e9f5a
commit 82ea6e9f5a
parent dddc92877c
3 changed files with 72 additions and 0 deletions
--- a/hosts/tsuki/configuration.nix
+++ b/hosts/tsuki/configuration.nix
@ -26,6 +26,8 @@
    ./services/postgres.nix
    ./services/vaultwarden.nix
    ./services/vscode-server.nix
+
+    ./services/scrapers/nhk-easy-news/default.nix
  ];

  machineVars = {
--- a/hosts/tsuki/services/scrapers/nhk-easy-news/default.nix
+++ b/hosts/tsuki/services/scrapers/nhk-easy-news/default.nix
@ -0,0 +1,30 @@
+{ config, pkgs, lib, ... }: let
+  # script.py wrapped as an executable with the Python libraries it imports.
+  script = pkgs.writers.writePython3 "scrape-nhk-easy-news.py" {
+    libraries = with pkgs.python3Packages; [ requests wget ];
+  } (lib.fileContents ./script.py);
+in {
+  # One-shot job that runs the scraper inside its own state directory.
+  systemd.services.scrape-nhk-easy-news = {
+    # The script issues HTTP requests as soon as it starts, so it needs the
+    # network to be configured, not merely the network stack to be started.
+    wants = [ "network-online.target" ];
+    after = [ "network-online.target" ];
+    serviceConfig = {
+      Type = "oneshot";
+      ExecStart = script;
+      DynamicUser = true;
+      PrivateTmp = true;
+      ProtectHome = true;
+      ProtectProc = "invisible";
+      ProtectSystem = "strict";
+      # The script resolves its output paths relative to the working
+      # directory, which is the state directory declared below.
+      WorkingDirectory = "/var/lib/nhk_easy_news_scraper";
+      StateDirectory = "nhk_easy_news_scraper";
+      StateDirectoryMode = "0755";
+    };
+  };
+
+  # Trigger the scraper service every day at 03:00.
+  systemd.timers.scrape-nhk-easy-news = {
+    wantedBy = [ "timers.target" ];
+    timerConfig = {
+      Unit = "scrape-nhk-easy-news.service";
+      OnCalendar = "*-*-* 03:00:00";
+    };
+  };
+}
--- a/hosts/tsuki/services/scrapers/nhk-easy-news/script.py
+++ b/hosts/tsuki/services/scrapers/nhk-easy-news/script.py
@ -0,0 +1,40 @@
+from pathlib import Path
+import os
+
+import requests
+import wget
+
+
+def main():
+    nhkjson = requests.get(
+        'http://www3.nhk.or.jp/news/easy/news-list.json').json()
+    base_dir = Path(".").resolve()
+
+    if not (base_dir / 'articles').exists():
+        os.mkdir(base_dir / 'articles')
+
+    for key, value in nhkjson[0].items():
+        for x in value:
+            news_id = x['news_id']
+            path = base_dir / f'articles/nhkeasy_{news_id}.html'
+
+            if path.exists():
+                # This means that the article has already been downloaded.
+                # Skip and continue
+                continue
+
+            print(f"New article with ID: {news_id}")
+            try:
+                nhkurl = x['news_web_url']
+                wget.download(nhkurl, out=str(path))
+                print("Successful download of article ID: " + x['news_id'])
+            except Exception as err:
+                if path.exists():
+                    os.remove(path)
+                print("Failed to download article ID: " + x['news_id'])
+                print(err)
+
+
+# Run the scraper only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
+