Add project slides

2024-04-26 00:49:11 +02:00 · 2024-04-26 00:49:11 +02:00 · 211d590a35
commit 211d590a35
parent 4e030a510b
34 changed files with 574 additions and 0 deletions
--- a/project_slides/flake.lock
+++ b/project_slides/flake.lock
@ -0,0 +1,26 @@
+{
+  "nodes": {
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1682817260,
+        "narHash": "sha256-kFMXzKNj4d/0Iqbm5l57rHSLyUeyCLMuvlROZIuuhvk=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "db1e4eeb0f9a9028bcb920e00abbc1409dd3ef36",
+        "type": "github"
+      },
+      "original": {
+        "id": "nixpkgs",
+        "ref": "nixos-22.11",
+        "type": "indirect"
+      }
+    },
+    "root": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
--- a/project_slides/flake.nix
+++ b/project_slides/flake.nix
@ -0,0 +1,31 @@
+{
+  # Resolved through the flake registry; the exact revision is pinned in flake.lock.
+  inputs.nixpkgs.url = "nixpkgs/nixos-22.11";
+
+  outputs = { self, nixpkgs }: let
+    # Outputs are only defined for this one platform.
+    system = "x86_64-linux";
+    pkgs = nixpkgs.legacyPackages.${system};
+  in {
+    apps.${system} = {
+      # `nix run`: start reveal-md on main.md, resolved relative to the
+      # directory the command is run from.
+      default = {
+        type = "app";
+        # writeShellScript (unlike writeScript) prepends a shebang, so the
+        # generated file can be exec'd directly by `nix run`.
+        program = toString (pkgs.writeShellScript "reveal-md-tdt4310-project" ''
+          ${pkgs.nodePackages.reveal-md}/bin/reveal-md main.md
+        '');
+      };
+    };
+
+    # `nix develop`: shell with reveal-md on PATH for working on the slides.
+    devShells.${system}.default = pkgs.mkShell {
+      packages = with pkgs; [
+        nodePackages.reveal-md
+        # inkscape
+        # gimp
+        # drawio
+      ];
+    };
+  };
+}
--- a/project_slides/main.md
+++ b/project_slides/main.md
@ -0,0 +1,237 @@
+<link rel="stylesheet" href="./static/main.css"/>
+
+### TDT 4310 - Intelligent Text Analysis Project
+
+#### Sorting Japanese sentences by linguistic complexity
+
+----
+
+### Overview
+
+1. Introduction and motivation
+1. Background
+1. Datasets
+1. Methodology
+1. Evaluation
+1. Conclusion, and further work
+
+---
+
+<img src="./static/graphics/jst1.png" width="18%"/>
+<img src="./static/graphics/jst2.png" width="18%"/>
+<img src="./static/graphics/jst3.png" width="18%"/>
+
+<br/>
+<br/>
+
+<footer>Motivation</footer>
+
+---
+
+<div style="font-size: 0.8em">
+
+| JMDict | Tatoeba / Tanaka corpus | NHK Easy News | MeCab |
+|--------|-------------------------|---------------|-------|
+| Open source dictionary | Multilingual sentence pairs | Easy-to-read news articles | POS and morphological analyzer |
+| <img src="./static/graphics/jmdict.png" width="100%"/> | <img src="./static/graphics/tatoeba.png" width="100%"/> | <img src="./static/graphics/nhk.png" width="100%"/> | |
+
+</div>
+
+<br/>
+<br/>
+
+
+<footer>Datasets</footer>
+
+---
+
+#### TF-IDF
+
+Extract the most meaningful words of a document
+
+<br/>
+
+#### Sense disambiguation
+
+Pinpoint which sense of the word is used, based on surrounding context and grammar.
+
+<footer>Background</footer>
+
+----
+
+### Japanese
+
+<div class="grid">
+<div class="col-9">
+
+#### Three writing systems
+
+| <span style="color: red;">hiragana</span> | <span style="color: green;">katakana</span> | <span style="color: blue;">kanji</span> |
+|----------|----------|-------|
+| <img src="./static/graphics/hiragana.png"/> | <img src="./static/graphics/katakana.png"/> | <img src="./static/graphics/kanji2.png"/> |
+
+</div>
+<div class="col-3">
+<div class="row-2">
+
+<p>
+  １０
+  <span style="color: green;">ページ</span>
+  <span style="color: red;">の</span>
+  ５
+  <span style="color: blue;">行目</span>
+  <span style="color: red;">をみなさい</span>
+</p>
+
+<p style="font-size: 0.8em;">
+  <span style="color: red;">Let's start from</span>
+  (the)
+  fifth
+  <span style="color: blue;">line</span>
+  <span style="color: red;">on</span>
+  <span style="color: green;">page</span>
+  10
+</p>
+
+
+##### Multiple readings per kanji
+
+形 - katachi, kata, gyou, kei
+
+</div>
+<div class="row-1">
+
+<br/>
+
+##### Furigana
+
+<ruby>
+  振 <rp>(</rp><rt>furi</rt><rp>)</rp>
+  仮 <rp>(</rp><rt>ga</rt><rp>)</rp>
+  名 <rp>(</rp><rt>na</rt><rp>)</rp>
+</ruby>
+
+</div>
+</div>
+</div>
+
+<footer>Background</footer>
+
+---
+
+#### Data ingestion, preprocessing and disambiguation
+
+<br/>
+
+##### Tanaka Corpus
+
+<p>
+信用█為る(する){して}█と█彼(かれ)[01]█は|1█言う{言った}
+</p>
+
+<br/>
+
+##### NHK News Articles
+
+Scrape -> Extract text -> MeCab + Furigana -> Try disambiguating with POS
+
+<footer>Methodology</footer>
+
+Note:
+
+Disambiguation here is not necessarily sense disambiguation, but rather disambiguating the dictionary entry.
+
+Could exploit the English translation to disambiguate all the way down to the word senses.
+
+----
+
+#### TF-IDF?
+
+<br/>
+
+<div>
+
+$ \text{TF-IDF} = \frac{\text{Amount of term in doc}}{\text{Amount of terms in doc}} \cdot \log \frac{\text{Amount of docs}}{1 + \text{ Amount of docs containing term}} $
+
+</div>
+<br/>
+<div class="fragment" data-fragment-index="0">
+
+$ \text{TF-DF} = \frac{AVG(\text{Amount of term in doc})}{\text{Amount of terms in doc}} \cdot \frac{\text{ Amount of docs containing term}}{\text{Amount of docs}} $
+
+</div>
+
+<footer>Methodology</footer>
+
+Note:
+
+TF-IDF is usually used for finding out how meaningful a word is to a document. Here, we want to do the opposite. The value should have a higher score, if it is more common across several documents.
+
+----
+
+#### Word difficulty
+
+| Commonness | Dialects | Kanji | Katakana | NHK rating |
+|------------|----------|-------|----------|------------|
+| 25%        | 10 %     | 25%   | 15%      | 25%        |
+| <img width="200px" src="./static/graphics/curves/common.png"> | <img width="200px" src="./static/graphics/curves/dialect.png"> | <img width="200px" src="./static/graphics/curves/kanji.png"> | <img width="200px" src="./static/graphics/curves/katakana.png"> | <img width="200px" src="./static/graphics/curves/nhk.png"> |
+
+<footer>Methodology</footer>
+
+----
+
+#### Sentence difficulty
+
+| Word difficulty sum | Hardest word | Sentence Length |
+|------------|----------|-------|
+| 50%        | 20 %     | 30%   |
+| <img width="200px" src="./static/graphics/curves/wordsum.png"> |  | <img width="200px" src="./static/graphics/curves/sentence_length.png"> |
+
+
+<footer>Methodology</footer>
+
+---
+
+<div class="columns">
+  <div>
+    <img width="80%" src="./static/graphics/examples/test1.png"/>
+  </div>
+  <div>
+    <img width="90%" src="./static/graphics/examples/test2.png"/>
+  </div>
+</div>
+
+<footer>Evaluation</footer>
+
+----
+
+<div class="columns">
+  <div>
+    <img width="90%" src="./static/graphics/examples/book1.png"/>
+  </div>
+  <div>
+    <img width="100%" src="./static/graphics/examples/book2.png"/>
+  </div>
+</div>
+
+<footer>Evaluation</footer>
+
+----
+
+
+<ul>
+<div>
+<li>Apart from some bugs, the system seems to be working as intended</li>
+</div>
+<div class="fragment" data-fragment-index="0">
+<li>The factors should be more strongly grounded in linguistic research</li>
+</div>
+<div class="fragment" data-fragment-index="1">
+<li>Alternatively a dataset that would make it possible to evaluate the accuracy of the implementation</li>
+</div>
+<div class="fragment" data-fragment-index="2">
+<li>More data left unused.</li>
+</div>
+</ul>
+
+<footer>Conclusion, and further work</footer>
--- a/project_slides/reveal-md.json
+++ b/project_slides/reveal-md.json
@ -0,0 +1,4 @@
+{
+  "highlightTheme": "monokai-sublime",
+  "theme": "black"
+}
--- a/project_slides/reveal.json
+++ b/project_slides/reveal.json
@ -0,0 +1,8 @@
+{
+  "theme": "black",
+  "transition": "none",
+  "controls": true,
+  "progress": true,
+  "keyboard": {"81": "toggleOverview"},
+  "width": 1300
+}
--- a/project_slides/static/graphics/book1.png
+++ b/project_slides/static/graphics/book1.png
--- a/project_slides/static/graphics/book2.png
+++ b/project_slides/static/graphics/book2.png
--- a/project_slides/static/graphics/book3.png
+++ b/project_slides/static/graphics/book3.png
--- a/project_slides/static/graphics/curves/common.png
+++ b/project_slides/static/graphics/curves/common.png
--- a/project_slides/static/graphics/curves/dialect.png
+++ b/project_slides/static/graphics/curves/dialect.png
--- a/project_slides/static/graphics/curves/generate_curves.py
+++ b/project_slides/static/graphics/curves/generate_curves.py
@ -0,0 +1,42 @@
+import math
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import os
+
+
+# Assumes 0 <= x <= 1
+def sigmoid(x, slope=0.1, offset=0, max_x=1, flip=False) -> float:
+    assert x <= max_x
+    x = x - (max_x / 2) - (offset * max_x / 2)
+    s = 1 / (1 + math.exp(-x / slope))
+    return max_x - s if flip else s
+
+curve_dir = os.path.dirname(__file__)
+
+for name, f in [
+  ('common', lambda x: sigmoid(x, slope=0.05, offset=-0.6, flip=True)),
+  ('dialect', lambda x: sigmoid(x, slope=0.08, offset=-0.2)),
+  ('kanji', lambda x: x ** 5),
+  ('katakana', lambda x: 0 if x > 0.5 else 1),
+  ('nhk', lambda x: sigmoid(x, slope=0.03, offset=-0.6, flip=True)),
+  ('wordsum', lambda x: x),
+]:
+  plt.rc('font', size=33)
+  plt.xlim(-0.05, 1.05)
+  plt.ylim(-0.05, 1.05)
+  plt.locator_params(nbins=2)
+  space = np.linspace(0, 1, 1000)
+  p = [f(n) for n in space] # 
+  plt.plot(space, p, linewidth=5)
+  plt.savefig(f"{curve_dir}/{name}.png")
+  plt.clf()
+
+plt.rc('font', size=33)
+plt.xlim(-0.05, 24.05)
+plt.ylim(-0.05, 1.05)
+plt.locator_params(nbins=3)
+space = np.linspace(0, 24, 1000)
+p = [sigmoid(n, slope=1.4, max_x=24) for n in space] # 
+plt.plot(space, p, linewidth=5)
+plt.savefig(f"{curve_dir}/sentence_length.png")
--- a/project_slides/static/graphics/curves/kanji.png
+++ b/project_slides/static/graphics/curves/kanji.png
--- a/project_slides/static/graphics/curves/katakana.png
+++ b/project_slides/static/graphics/curves/katakana.png
--- a/project_slides/static/graphics/curves/nhk.png
+++ b/project_slides/static/graphics/curves/nhk.png
--- a/project_slides/static/graphics/curves/sentence_length.png
+++ b/project_slides/static/graphics/curves/sentence_length.png
--- a/project_slides/static/graphics/curves/wordsum.png
+++ b/project_slides/static/graphics/curves/wordsum.png
--- a/project_slides/static/graphics/examples/book1.png
+++ b/project_slides/static/graphics/examples/book1.png
--- a/project_slides/static/graphics/examples/book2.png
+++ b/project_slides/static/graphics/examples/book2.png
--- a/project_slides/static/graphics/examples/test1.png
+++ b/project_slides/static/graphics/examples/test1.png
--- a/project_slides/static/graphics/examples/test2.png
+++ b/project_slides/static/graphics/examples/test2.png
--- a/project_slides/static/graphics/hiragana.png
+++ b/project_slides/static/graphics/hiragana.png
--- a/project_slides/static/graphics/jmdict.png
+++ b/project_slides/static/graphics/jmdict.png
--- a/project_slides/static/graphics/jst1.png
+++ b/project_slides/static/graphics/jst1.png
--- a/project_slides/static/graphics/jst2.png
+++ b/project_slides/static/graphics/jst2.png
--- a/project_slides/static/graphics/jst3.png
+++ b/project_slides/static/graphics/jst3.png
--- a/project_slides/static/graphics/kanji.png
+++ b/project_slides/static/graphics/kanji.png
--- a/project_slides/static/graphics/kanji2.png
+++ b/project_slides/static/graphics/kanji2.png
--- a/project_slides/static/graphics/katakana.png
+++ b/project_slides/static/graphics/katakana.png
--- a/project_slides/static/graphics/nhk.png
+++ b/project_slides/static/graphics/nhk.png
--- a/project_slides/static/graphics/ntnu_uten_slagord_hvit-2.pdf
+++ b/project_slides/static/graphics/ntnu_uten_slagord_hvit-2.pdf
--- a/project_slides/static/graphics/tatoeba.png
+++ b/project_slides/static/graphics/tatoeba.png
--- a/project_slides/static/graphics/test1.png
+++ b/project_slides/static/graphics/test1.png
--- a/project_slides/static/graphics/test2.png
+++ b/project_slides/static/graphics/test2.png
--- a/project_slides/static/main.css
+++ b/project_slides/static/main.css
@ -0,0 +1,226 @@
+:root {
+  --black : hsl(0, 0%, 0%);
+  --black2 : hsl(60, 17%, 11%);
+  --black3 : hsl(70, 8%, 15%);
+  --blue : hsl(190, 81%, 67%);
+  --grey : hsl(55, 8%, 26%);
+  --orange : hsl(32, 98%, 56%);
+  --orange2 : hsl(30, 83%, 34%);
+  --orange3 : hsl(47, 100%, 79%);
+  --purple : hsl(261, 100%, 75%);
+  --red : hsl(0, 93%, 59%);
+  --red2 : hsl(338, 95%, 56%);
+  --white : hsl(0, 0%, 97%);
+  --white2 : hsl(60, 36%, 96%);
+  --white3 : hsl(60, 30%, 96%);
+  --yellow : hsl(54, 70%, 68%);
+  --yellow2 : hsl(80, 76%, 53%);
+  --yellow3 : hsl(60, 12%, 79%);
+  --yellow4 : hsl(55, 11%, 22%);
+  --yellow5 : hsl(50, 11%, 41%);
+}
+
+.red {
+	color: red;
+}
+
+.black   { color: var(--black); }
+.black2  { color: var(--black2); }
+.black3  { color: var(--black3); }
+.blue    { color: var(--blue); }
+.grey    { color: var(--grey); }
+.orange  { color: var(--orange); }
+.orange2 { color: var(--orange2); }
+.orange3 { color: var(--orange3); }
+.purple  { color: var(--purple); }
+.red     { color: var(--red); }
+.red2    { color: var(--red2); }
+.white   { color: var(--white); }
+.white2  { color: var(--white2); }
+.white3  { color: var(--white3); }
+.yellow  { color: var(--yellow); }
+.yellow2 { color: var(--yellow2); }
+.yellow3 { color: var(--yellow3); }
+.yellow4 { color: var(--yellow4); }
+.yellow5 { color: var(--yellow5); }
+
+.bak-black   { background-color: var(--black); }
+.bak-black2  { background-color: var(--black2); }
+.bak-black3  { background-color: var(--black3); }
+.bak-blue    { background-color: var(--blue); }
+.bak-grey    { background-color: var(--grey); }
+.bak-orange  { background-color: var(--orange); }
+.bak-orange2 { background-color: var(--orange2); }
+.bak-orange3 { background-color: var(--orange3); }
+.bak-purple  { background-color: var(--purple); }
+.bak-red     { background-color: var(--red); }
+.bak-red2    { background-color: var(--red2); }
+.bak-white   { background-color: var(--white); }
+.bak-white2  { background-color: var(--white2); }
+.bak-white3  { background-color: var(--white3); }
+.bak-yellow  { background-color: var(--yellow); }
+.bak-yellow2 { background-color: var(--yellow2); }
+.bak-yellow3 { background-color: var(--yellow3); }
+.bak-yellow4 { background-color: var(--yellow4); }
+.bak-yellow5 { background-color: var(--yellow5); }
+
+.columns {
+	display: flex;
+}
+
+.columns > div {
+	flex: 1;
+}
+
+/* Tallsystemer */
+.num-systems > div {
+  font-size: 0.75em;
+  text-align: left;
+}
+
+.num-system-title {
+  font-size: 1.5em;
+}
+
+.num-span-red > span {
+  color: var(--red);
+  margin-left: 0.2em;
+}
+.num-span-blue > span {
+  color: var(--blue);
+  margin-left: 0.2em;
+}
+.num-span-green > span {
+  color: var(--yellow2);
+  margin-left: 0.2em;
+}
+.num-span-orange > span {
+  color: var(--orange);
+  margin-left: 0.2em;
+}
+
+/* Fetch Decode Execute*/
+.fde-table-item {
+  padding: 0.5em;
+}
+
+.fde-grid {
+  display: grid;
+  grid-template-columns: repeat(8, 1fr);
+  column-gap: 0;
+  row-gap: 0;
+  justify-items: stretch;
+}
+
+.fde-grid > * {
+  margin: 0;
+}
+
+.fde-c2 { grid-column-start: 2; }
+.fde-c3 { grid-column-start: 3; }
+.fde-c4 { grid-column-start: 4; }
+.fde-c5 { grid-column-start: 5; }
+.fde-c6 { grid-column-start: 6; }
+.fde-c7 { grid-column-start: 7; }
+.fde-c8 { grid-column-start: 8; }
+
+.fde-r2 { grid-row-start: 2; }
+.fde-r3 { grid-row-start: 3; }
+.fde-r4 { grid-row-start: 4; }
+.fde-r5 { grid-row-start: 5; }
+.fde-r6 { grid-row-start: 6; }
+.fde-r7 { grid-row-start: 7; }
+.fde-r8 { grid-row-start: 8; }
+
+/* RGB */
+.rgb-pvv {color: #283681;}
+.rgb-red {color: #FF0000;}
+.rgb-gre {color: #00FF00;}
+.rgb-blu {color: #0000FF;}
+.rgb-cof {color: #C0FFEE;}
+.rgb-whi {color: #FFFFFF;}
+.rgb-bla {color: #000000;}
+
+/* Network diagrams */
+.prev-netdiagram {
+  opacity: 0.5;
+}
+
+.net-title {
+  display: flex;
+  justify-content: center;
+  align-items: center;
+}
+
+.net-title > h3 {
+  display: inline;
+  padding: 1em;
+}
+
+/* Misc */
+
+.replacable-fragment {
+	position: relative;
+  margin: auto;
+}
+
+.replacable-fragment > div {
+  position:absolute;
+	top:0;
+	/* TODO: fix this properly */
+	left:25%;
+}
+
+/* footer { */
+/*   position: absolute; */
+/*   left: 0; */
+/*   bottom: 0; */
+/*   width: 100%; */
+/*   background-color: grey; */
+/*   color: white; */
+/*   text-align: center; */
+/* } */
+
+.reveal {
+  position: relative;
+  height: 100%;
+}
+
+.slides {
+  position: absolute;
+  top: 0;
+  left: 0;
+  right: 0;
+  bottom: 60px; /* leave space for the footer */
+  overflow: auto;
+}
+
+footer {
+  position: fixed;
+  bottom: 0;
+  left: 0;
+  right: 0;
+  height: 60px;
+  color: grey;
+}
+
+.grid {
+  display: flex;
+}
+
+.col-9 {
+  flex: 9;
+}
+
+.col-3 {
+  /* Narrow column: 3 parts next to .col-9's 9 parts. */
+  flex: 3;
+}
+
+.row-1 {
+  height: 50%;
+  overflow: auto;
+}
+.row-2 {
+  height: 50%;
+  overflow: auto;
+}