191 lines
5.9 KiB
Python
191 lines
5.9 KiB
Python
from argparse import ArgumentParser, Namespace
|
|
from pathlib import Path
|
|
from itertools import chain
|
|
import os
|
|
|
|
|
|
from src.frontend.flaskapp import create_app
|
|
from src.database import connect_to_database
|
|
from src.data_ingestion import (
|
|
ingest_jmdict,
|
|
ingest_nhk_easy_news_articles,
|
|
ingest_tatoeba_sentences
|
|
)
|
|
from src.models import (
|
|
Base,
|
|
jmdict_tables,
|
|
nhk_tables,
|
|
tatoeba_tables,
|
|
)
|
|
from src.models import (
|
|
TatoebaSentencePairUsesJMDictEntry,
|
|
NHKEasyNewsArticleUsesJMDictEntry,
|
|
)
|
|
|
|
from src.processing import *
|
|
|
|
# Command-line parser for this script; its options are registered below.
arg_parser = ArgumentParser()
|
|
|
|
def dir_path(string) -> Path:
    """Convert a command-line string into a ``Path`` to an existing directory.

    Used as the ``type=`` callable of the ``--data-dir`` argument.

    Args:
        string: Path given on the command line.

    Returns:
        ``Path(string)`` when ``string`` names an existing directory.

    Raises:
        NotADirectoryError: If ``string`` is not an existing directory.
    """
    if not os.path.isdir(string):
        # Raise (rather than return) the error so that an invalid path can
        # never end up stored as the parsed argument value.
        raise NotADirectoryError(string)
    return Path(string)
|
|
|
|
# Flags that re-ingest raw source data.
arg_parser.add_argument(
    '--reset-all', '-r',
    action='store_true',
    help='Reset and recalculate all data',
)
arg_parser.add_argument(
    '--reset-jmdict', '-rj',
    action='store_true',
    help='Reingest data from jmdict',
)
arg_parser.add_argument(
    '--reset-nhk-articles', '-rna',
    action='store_true',
    help='Reingest data from nhk',
)
arg_parser.add_argument(
    '--reset-tatoeba', '-rt',
    action='store_true',
    help='Reingest data from tatoeba sentences',
)

# Flags that recompute derived data.
arg_parser.add_argument(
    '--reset-sentence-matches', '-rs',
    action='store_true',
    help='Recalculate relations between sentences and jmdict entries',
)
arg_parser.add_argument(
    '--reset-nhk-statistics', '-rns',
    action='store_true',
    # This flag triggers the nhk word frequency calculation, so the help text
    # describes that instead of repeating the sentence-match description.
    help='Recalculate word frequency statistics of nhk easy news articles',
)
arg_parser.add_argument(
    '--reset-difficulty-values', '-rd',
    action='store_true',
    help='Recalculate difficulty of jmdict entries and sentences',
)

# Not implemented yet: flags for redownloading the raw data.
# arg_parser.add_argument('--redownload-jmdict', '-Rj', action='store_true', help='Redownload data from jmdict')
# arg_parser.add_argument('--redownload-nhk', '-Rn', action='store_true', help='Redownload data from nhk')
# arg_parser.add_argument('--redownload-tatoeba', '-Rt', action='store_true', help='Redownload data from tatoeba sentences')

# General options; these are the values passed on to connect_to_database.
arg_parser.add_argument(
    '--echo-sql', '-e',
    action='store_true',
    help='Echo SQL statements',
)
arg_parser.add_argument(
    '--data-dir', '-d',
    type=dir_path,
    default=Path('./data'),
    help='Directory where data is stored',
)
arg_parser.add_argument(
    '--use-memory-db', '-m',
    action='store_true',
    help='Use an in-memory database for debugging purposes',
)
|
|
|
|
def _reingest_specified_tables(args: Namespace) -> None:
    """Drop, recreate and re-ingest the table groups selected by the reset flags.

    Args:
        args: Parsed command-line arguments. Reads ``reset_jmdict``,
            ``reset_nhk_articles``, ``reset_tatoeba``, ``data_dir``,
            ``echo_sql`` and ``use_memory_db``.
    """
    # Gather the model classes of every selected group, in the fixed order
    # jmdict -> nhk -> tatoeba.
    selected_models = []
    if args.reset_jmdict:
        selected_models.extend(jmdict_tables)
    if args.reset_nhk_articles:
        selected_models.extend(nhk_tables)
    if args.reset_tatoeba:
        selected_models.extend(tatoeba_tables)

    # drop_all/create_all are given the underlying table objects.
    sql_tables = [model.__table__ for model in selected_models]

    make_session = connect_to_database(args.data_dir, args.echo_sql, args.use_memory_db)

    with make_session() as session:
        Base.metadata.drop_all(session.get_bind(), sql_tables)
        Base.metadata.create_all(session.get_bind(), sql_tables)

        # Pair each flag with its ingestion routine; run only the requested ones.
        ingestion_steps = (
            (args.reset_jmdict, ingest_jmdict),
            (args.reset_nhk_articles, ingest_nhk_easy_news_articles),
            (args.reset_tatoeba, ingest_tatoeba_sentences),
        )
        for requested, ingest in ingestion_steps:
            if requested:
                ingest(session, args.data_dir)
|
|
|
|
|
|
def _reprocess_sentence_entry_matches(args: Namespace) -> None:
    """Recreate the link tables between text sources and JMDict entries.

    The NHK link table is rebuilt when jmdict, the nhk articles or the
    sentence matches were reset; the Tatoeba link table is rebuilt when
    jmdict, tatoeba or the sentence matches were reset.

    Args:
        args: Parsed command-line arguments. Reads the ``reset_*`` flags named
            above plus ``data_dir``, ``echo_sql`` and ``use_memory_db``.
    """
    make_session = connect_to_database(args.data_dir, args.echo_sql, args.use_memory_db)

    with make_session() as session:
        if args.reset_jmdict or args.reset_nhk_articles or args.reset_sentence_matches:
            nhk_link_tables = [NHKEasyNewsArticleUsesJMDictEntry.__table__]
            Base.metadata.drop_all(session.get_bind(), nhk_link_tables)
            Base.metadata.create_all(session.get_bind(), nhk_link_tables)
            connect_nhk_easy_news_articles_to_jmdict_entries(session)

        if args.reset_jmdict or args.reset_tatoeba or args.reset_sentence_matches:
            tatoeba_link_tables = [TatoebaSentencePairUsesJMDictEntry.__table__]
            Base.metadata.drop_all(session.get_bind(), tatoeba_link_tables)
            Base.metadata.create_all(session.get_bind(), tatoeba_link_tables)
            connect_tatoeba_sentences_to_jmdict_entries(session)
|
|
|
|
|
|
def _recalculate_nhk_statistics(args: Namespace) -> None:
    """Run the NHK Easy News word frequency calculation on a fresh session.

    Args:
        args: Parsed command-line arguments. Reads ``data_dir``, ``echo_sql``
            and ``use_memory_db`` to open the database.
    """
    make_session = connect_to_database(args.data_dir, args.echo_sql, args.use_memory_db)

    with make_session() as db_session:
        calculate_word_frequency_of_nhk_easy_news_articles(db_session)
|
|
|
|
|
|
def _recalculate_difficulty_values(args: Namespace) -> None:
    """Run the difficulty calculation for all words and sentences on a fresh session.

    Args:
        args: Parsed command-line arguments. Reads ``data_dir``, ``echo_sql``
            and ``use_memory_db`` to open the database.
    """
    make_session = connect_to_database(args.data_dir, args.echo_sql, args.use_memory_db)

    with make_session() as db_session:
        calculate_difficulty_values_of_all_words_and_sentences(db_session)
|
|
|
|
|
|
# Script entry point: run the requested reset/recalculation stages in order,
# or start the web app when no reset flag was given.
if __name__ == "__main__":
    args = arg_parser.parse_args()

    # --reset-all is shorthand for switching on every individual reset flag.
    if args.reset_all:
        args.reset_jmdict = True
        args.reset_nhk_articles = True
        args.reset_tatoeba = True
        args.reset_sentence_matches = True
        args.reset_nhk_statistics = True
        args.reset_difficulty_values = True

    # if args.redownload:
    #     print('Redownloading data...')
    #     TODO: download jmdict, nhk easy news, tanaka corpus
    #     pass

    # Set by every stage that runs; decides below whether the web app starts.
    changed_database = False

    # Stage 1: re-ingest the raw source tables.
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
    ]):
        changed_database = True
        _reingest_specified_tables(args)

    # Stage 2: rebuild the links between sentences/articles and jmdict
    # entries. Also runs whenever a source table was re-ingested.
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
        args.reset_sentence_matches,
    ]):
        changed_database = True
        _reprocess_sentence_entry_matches(args)

    # Stage 3: recompute the nhk word frequency statistics.
    if any([
        args.reset_nhk_articles,
        args.reset_nhk_statistics,
    ]):
        changed_database = True
        _recalculate_nhk_statistics(args)

    # Stage 4: recompute difficulty values; runs after any other reset.
    if any([
        args.reset_jmdict,
        args.reset_nhk_articles,
        args.reset_tatoeba,
        args.reset_sentence_matches,
        args.reset_nhk_statistics,
        args.reset_difficulty_values,
    ]):
        changed_database = True
        _recalculate_difficulty_values(args)

    if changed_database:
        print('Recreated the database with newly processed data!')
        # The parser defines no --reset-db flag, so the message names the
        # real --reset-* family of flags instead.
        print('You can now run the app without any --reset-* flags to start the webserver.')
    else:
        app = create_app(args)
        app.run(debug=True)
|