"""Command-line entry point for the language-data pipeline and web app.

Each ``--reset-*`` flag re-runs one stage of the data pipeline (ingestion,
sentence/entry matching, statistics, difficulty scoring).  With no reset
flags, the script starts the Flask webserver instead.
"""
from argparse import ArgumentParser, Namespace
from itertools import chain
from pathlib import Path
import os

from src.frontend.flaskapp import create_app
from src.database import connect_to_database
from src.data_ingestion import (
    ingest_jmdict,
    ingest_nhk_easy_news_articles,
    ingest_tatoeba_sentences,
)
from src.models import (
    Base,
    jmdict_tables,
    nhk_tables,
    tatoeba_tables,
    TatoebaSentencePairUsesJMDictEntry,
    NHKEasyNewsArticleUsesJMDictEntry,
)
# NOTE(review): star import kept for backward compatibility — the processing
# functions used below (connect_*_to_jmdict_entries, calculate_*) come from
# here.  Consider making these imports explicit.
from src.processing import *

arg_parser = ArgumentParser()


def dir_path(string) -> Path:
    """argparse ``type=`` converter: accept only an existing directory.

    Raises:
        NotADirectoryError: if *string* does not name an existing directory
            (argparse reports the raised exception as a usage error).
    """
    if os.path.isdir(string):
        return Path(string)
    # BUG FIX: the original code *returned* NotADirectoryError(string), which
    # handed the exception object to argparse as if it were a valid value.
    # The converter must raise for argparse to reject the argument.
    raise NotADirectoryError(string)


arg_parser.add_argument('--reset-all', '-r', action='store_true', help='Reset and recalculate all data')
arg_parser.add_argument('--reset-jmdict', '-rj', action='store_true', help='Reingest data from jmdict')
arg_parser.add_argument('--reset-nhk-articles', '-rna', action='store_true', help='Reingest data from nhk')
arg_parser.add_argument('--reset-tatoeba', '-rt', action='store_true', help='Reingest data from tatoeba sentences')
arg_parser.add_argument('--reset-sentence-matches', '-rs', action='store_true', help='Recalculate relations between sentences and jmdict entries')
# BUG FIX: help text was a copy-paste of --reset-sentence-matches; this flag
# actually triggers _recalculate_nhk_statistics (word frequencies).
arg_parser.add_argument('--reset-nhk-statistics', '-rns', action='store_true', help='Recalculate word frequency statistics of NHK articles')
arg_parser.add_argument('--reset-difficulty-values', '-rd', action='store_true', help='Recalculate difficulty of jmdict entries and sentences')
# TODO: implement data redownloading, then re-enable these flags.
# arg_parser.add_argument('--redownload-jmdict', '-Rj', action='store_true', help='Redownload data from jmdict')
# arg_parser.add_argument('--redownload-nhk', '-Rn', action='store_true', help='Redownload data from nhk')
# arg_parser.add_argument('--redownload-tatoeba', '-Rt', action='store_true', help='Redownload data from tatoeba sentences')
arg_parser.add_argument('--echo-sql', '-e', action='store_true', help='Echo SQL statements')
arg_parser.add_argument('--data-dir', '-d', type=dir_path, default=Path('./data'), help='Directory where data is stored')
arg_parser.add_argument('--use-memory-db', '-m', action='store_true', help='Use an in-memory database for debugging purposes')


def _make_session_maker(args: Namespace):
    """Build the session factory from the shared CLI connection options.

    Extracted because the identical connect_to_database(...) call was
    repeated verbatim in all four pipeline steps below.
    """
    return connect_to_database(
        args.data_dir,
        args.echo_sql,
        args.use_memory_db,
    )


def _reingest_specified_tables(args: Namespace) -> None:
    """Drop, recreate, and repopulate the tables selected by the reset flags.

    Only the table groups whose flag is set are touched; the others are
    left as-is.
    """
    selected = [
        jmdict_tables if args.reset_jmdict else [],
        nhk_tables if args.reset_nhk_articles else [],
        tatoeba_tables if args.reset_tatoeba else [],
    ]
    tables = [model.__table__ for model in chain.from_iterable(selected)]
    session_maker = _make_session_maker(args)
    with session_maker() as session:
        # Recreate only the selected tables (drop_all/create_all accept an
        # explicit table list), then reingest the matching source data.
        Base.metadata.drop_all(session.get_bind(), tables)
        Base.metadata.create_all(session.get_bind(), tables)
        if args.reset_jmdict:
            ingest_jmdict(session, args.data_dir)
        if args.reset_nhk_articles:
            ingest_nhk_easy_news_articles(session, args.data_dir)
        if args.reset_tatoeba:
            ingest_tatoeba_sentences(session, args.data_dir)


def _reprocess_sentence_entry_matches(args: Namespace) -> None:
    """Recompute the link tables between sentences/articles and JMDict entries.

    Each link table is rebuilt when any of its input datasets was reingested
    or when an explicit recomputation was requested.
    """
    session_maker = _make_session_maker(args)
    with session_maker() as session:
        if any([
            args.reset_jmdict,
            args.reset_nhk_articles,
            args.reset_sentence_matches,
        ]):
            Base.metadata.drop_all(session.get_bind(), [NHKEasyNewsArticleUsesJMDictEntry.__table__])
            Base.metadata.create_all(session.get_bind(), [NHKEasyNewsArticleUsesJMDictEntry.__table__])
            connect_nhk_easy_news_articles_to_jmdict_entries(session)
        if any([
            args.reset_jmdict,
            args.reset_tatoeba,
            args.reset_sentence_matches,
        ]):
            Base.metadata.drop_all(session.get_bind(), [TatoebaSentencePairUsesJMDictEntry.__table__])
            Base.metadata.create_all(session.get_bind(), [TatoebaSentencePairUsesJMDictEntry.__table__])
            connect_tatoeba_sentences_to_jmdict_entries(session)


def _recalculate_nhk_statistics(args: Namespace) -> None:
    """Recompute word-frequency statistics over the NHK Easy News articles."""
    session_maker = _make_session_maker(args)
    with session_maker() as session:
        calculate_word_frequency_of_nhk_easy_news_articles(session)


def _recalculate_difficulty_values(args: Namespace) -> None:
    """Recompute difficulty scores for all words and sentences."""
    session_maker = _make_session_maker(args)
    with session_maker() as session:
        calculate_difficulty_values_of_all_words_and_sentences(session)


if __name__ == "__main__":
    args = arg_parser.parse_args()

    # --reset-all implies every individual reset flag.
    if args.reset_all:
        args.reset_jmdict = True
        args.reset_nhk_articles = True
        args.reset_tatoeba = True
        args.reset_sentence_matches = True
        args.reset_nhk_statistics = True
        args.reset_difficulty_values = True

    # TODO: download jmdict, nhk easy news, tanaka corpus when the
    # --redownload-* flags are implemented.

    # Run each pipeline stage whose inputs changed (or was explicitly
    # requested); later stages depend on the outputs of earlier ones.
    changed_database = False
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
    ]):
        changed_database = True
        _reingest_specified_tables(args)
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
        args.reset_sentence_matches,
    ]):
        changed_database = True
        _reprocess_sentence_entry_matches(args)
    if any([
        args.reset_nhk_articles,
        args.reset_nhk_statistics,
    ]):
        changed_database = True
        _recalculate_nhk_statistics(args)
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
        args.reset_sentence_matches,
        args.reset_nhk_statistics,
        args.reset_difficulty_values,
    ]):
        changed_database = True
        _recalculate_difficulty_values(args)

    if changed_database:
        print('Recreated the database with newly processed data!')
        print('You can now run the app without the --reset-db flag to start the webserver.')
    else:
        # No data work requested: just serve the web app.
        app = create_app(args)
        app.run(debug=True)