Add project slides
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"nodes": {
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1682817260,
|
||||
"narHash": "sha256-kFMXzKNj4d/0Iqbm5l57rHSLyUeyCLMuvlROZIuuhvk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "db1e4eeb0f9a9028bcb920e00abbc1409dd3ef36",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "nixpkgs",
|
||||
"ref": "nixos-22.11",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
  # nixpkgs pinned to the 22.11 release branch (indirect flake reference).
  inputs.nixpkgs.url = "nixpkgs/nixos-22.11";

  outputs = { self, nixpkgs }: let
    # Outputs are only defined for this one platform.
    system = "x86_64-linux";
    pkgs = nixpkgs.legacyPackages.${system};
  in {
    # `nix run`: start reveal-md on the slide deck. `main.md` is a relative
    # path, so the command has to be run from the directory containing it.
    apps.${system}.default = {
      type = "app";
      # writeShellScript (unlike writeScript) prepends a shell shebang, so the
      # generated file can be exec'd directly as the app program.
      program = toString (pkgs.writeShellScript "reveal-md-tdt4310-project" ''
        ${pkgs.nodePackages.reveal-md}/bin/reveal-md main.md
      '');
    };

    # `nix develop`: shell with reveal-md on PATH; the graphics tools are
    # optional and left commented out.
    devShells.${system}.default = pkgs.mkShell {
      packages = with pkgs; [
        nodePackages.reveal-md
        # inkscape
        # gimp
        # drawio
      ];
    };
  };
}
|
|
@ -0,0 +1,237 @@
|
|||
<link rel="stylesheet" href="./static/main.css"/>
|
||||
|
||||
### TDT 4310 - Intelligent Text Analysis Project
|
||||
|
||||
#### Sorting Japanese sentences by linguistic complexity
|
||||
|
||||
----
|
||||
|
||||
### Overview
|
||||
|
||||
1. Introduction and motivation
|
||||
1. Background
|
||||
1. Datasets
|
||||
1. Methodology
|
||||
1. Evaluation
|
||||
1. Conclusion, and further work
|
||||
|
||||
---
|
||||
|
||||
<img src="./static/graphics/jst1.png" width="18%"/>
|
||||
<img src="./static/graphics/jst2.png" width="18%"/>
|
||||
<img src="./static/graphics/jst3.png" width="18%"/>
|
||||
|
||||
<br/>
|
||||
<br/>
|
||||
|
||||
<footer>Motivation</footer>
|
||||
|
||||
---
|
||||
|
||||
<div style="font-size: 0.8em">
|
||||
|
||||
| JMDict | Tatoeba / Tanaka corpus | NHK Easy News | MeCab |
|
||||
|--------|-------------------------|---------------|-------|
|
||||
| Open source dictionary | Multilingual sentence pairs | Easy-to-read news articles | POS and morphological analyzer |
|
||||
| <img src="./static/graphics/jmdict.png" width="100%"/> | <img src="./static/graphics/tatoeba.png" width="100%"/> | <img src="./static/graphics/nhk.png" width="100%"/> | |
|
||||
|
||||
</div>
|
||||
|
||||
<br/>
|
||||
<br/>
|
||||
|
||||
|
||||
<footer>Datasets</footer>
|
||||
|
||||
---
|
||||
|
||||
#### TF-IDF
|
||||
|
||||
Extract the most meaningful words of a document
|
||||
|
||||
<br/>
|
||||
|
||||
#### Sense disambiguation
|
||||
|
||||
Pinpoint which sense of the word is used, based on surrounding context and grammar.
|
||||
|
||||
<footer>Background</footer>
|
||||
|
||||
----
|
||||
|
||||
### Japanese
|
||||
|
||||
<div class="grid">
|
||||
<div class="col-9">
|
||||
|
||||
#### Three writing systems
|
||||
|
||||
| <span style="color: red;">hiragana</span> | <span style="color: green;">katakana</span> | <span style="color: blue;">kanji</span> |
|
||||
|----------|----------|-------|
|
||||
| <img src="./static/graphics/hiragana.png"/> | <img src="./static/graphics/katakana.png"/> | <img src="./static/graphics/kanji2.png"/> |
|
||||
|
||||
</div>
|
||||
<div class="col-3">
|
||||
<div class="row-2">
|
||||
|
||||
<p>
|
||||
10
|
||||
<span style="color: green;">ページ</span>
|
||||
<span style="color: red;">の</span>
|
||||
5
|
||||
<span style="color: blue;">行目</span>
|
||||
<span style="color: red;">をみなさい</span>
|
||||
</p>
|
||||
|
||||
<p style="font-size: 0.8em;">
|
||||
<span style="color: red;">Let's start from</span>
|
||||
(the)
|
||||
fifth
|
||||
<span style="color: blue;">line</span>
|
||||
<span style="color: red;">on</span>
|
||||
<span style="color: green;">page</span>
|
||||
10
|
||||
</p>
|
||||
|
||||
|
||||
##### Multiple readings per kanji
|
||||
|
||||
形 - katachi, kata, gyou, kei
|
||||
|
||||
</div>
|
||||
<div class="row-1">
|
||||
|
||||
<br/>
|
||||
|
||||
##### Furigana
|
||||
|
||||
<ruby>
|
||||
振 <rp>(</rp><rt>furi</rt><rp>)</rp>
|
||||
仮 <rp>(</rp><rt>ga</rt><rp>)</rp>
|
||||
名 <rp>(</rp><rt>na</rt><rp>)</rp>
|
||||
</ruby>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<footer>Background</footer>
|
||||
|
||||
---
|
||||
|
||||
#### Data ingestion, preprocessing and disambiguation
|
||||
|
||||
<br/>
|
||||
|
||||
##### Tanaka Corpus
|
||||
|
||||
<p>
|
||||
信用█為る(する){して}█と█彼(かれ)[01]█は|1█言う{言った}
|
||||
</p>
|
||||
|
||||
<br/>
|
||||
|
||||
##### NHK News Articles
|
||||
|
||||
Scrape -> Extract text -> MeCab + Furigana -> Try disambiguating with POS
|
||||
|
||||
<footer>Methodology</footer>
|
||||
|
||||
Note:
|
||||
|
||||
Disambiguation here is not necessarily word-sense disambiguation, but rather disambiguating the dictionary entry.
|
||||
|
||||
We could exploit the English translation to disambiguate all the way down to the word senses.
|
||||
|
||||
----
|
||||
|
||||
#### TF-IDF?
|
||||
|
||||
<br/>
|
||||
|
||||
<div>
|
||||
|
||||
$ \text{TF-IDF} = \frac{\text{Amount of term in doc}}{\text{Amount of terms in doc}} \cdot \log \frac{\text{Amount of docs}}{1 + \text{ Amount of docs containing term}} $
|
||||
|
||||
</div>
|
||||
<br/>
|
||||
<div class="fragment" data-fragment-index="0">
|
||||
|
||||
$ \text{TF-DF} = \frac{AVG(\text{Amount of term in doc})}{\text{Amount of terms in doc}} \cdot \frac{\text{ Amount of docs containing term}}{\text{Amount of docs}} $
|
||||
|
||||
</div>
|
||||
|
||||
<footer>Methodology</footer>
|
||||
|
||||
Note:
|
||||
|
||||
TF-IDF is usually used to find out how meaningful a word is to a document. Here, we want to do the opposite: a word should get a higher score the more common it is across several documents.
|
||||
|
||||
----
|
||||
|
||||
#### Word difficulty
|
||||
|
||||
| Commonness | Dialects | Kanji | Katakana | NHK rating |
|
||||
|------------|----------|-------|----------|------------|
|
||||
| 25% | 10 % | 25% | 15% | 25% |
|
||||
| <img width="200px" src="./static/graphics/curves/common.png"> | <img width="200px" src="./static/graphics/curves/dialect.png"> | <img width="200px" src="./static/graphics/curves/kanji.png"> | <img width="200px" src="./static/graphics/curves/katakana.png"> | <img width="200px" src="./static/graphics/curves/nhk.png"> |
|
||||
|
||||
<footer>Methodology</footer>
|
||||
|
||||
----
|
||||
|
||||
#### Sentence difficulty
|
||||
|
||||
| Word difficulty sum | Hardest word | Sentence Length |
|
||||
|------------|----------|-------|
|
||||
| 50% | 20 % | 30% |
|
||||
| <img width="200px" src="./static/graphics/curves/wordsum.png"> | | <img width="200px" src="./static/graphics/curves/sentence_length.png"> |
|
||||
|
||||
|
||||
<footer>Methodology</footer>
|
||||
|
||||
---
|
||||
|
||||
<div class="columns">
|
||||
<div>
|
||||
<img width="80%" src="./static/graphics/examples/test1.png"/>
|
||||
</div>
|
||||
<div>
|
||||
<img width="90%" src="./static/graphics/examples/test2.png"/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<footer>Evaluation</footer>
|
||||
|
||||
----
|
||||
|
||||
<div class="columns">
|
||||
<div>
|
||||
<img width="90%" src="./static/graphics/examples/book1.png"/>
|
||||
</div>
|
||||
<div>
|
||||
<img width="100%" src="./static/graphics/examples/book2.png"/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<footer>Evaluation</footer>
|
||||
|
||||
----
|
||||
|
||||
|
||||
<ul>
|
||||
<div>
|
||||
<li>Apart from some bugs, the system seems to be working as intended</li>
|
||||
</div>
|
||||
<div class="fragment" data-fragment-index="0">
|
||||
<li>The factors should be more strongly grounded in linguistic research</li>
|
||||
</div>
|
||||
<div class="fragment" data-fragment-index="1">
|
||||
<li>Alternatively a dataset that would make it possible to evaluate the accuracy of the implementation</li>
|
||||
</div>
|
||||
<div class="fragment" data-fragment-index="2">
|
||||
<li>More data left unused.</li>
|
||||
</div>
|
||||
</ul>
|
||||
|
||||
<footer>Conclusion, and further work</footer>
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"highlightTheme": "monokai-sublime",
|
||||
"theme": "black"
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
{
|
||||
"theme": "black",
|
||||
"transition": "none",
|
||||
"controls": true,
|
||||
"progress": true,
|
||||
"keyboard": {"81": "toggleOverview"},
|
||||
"width": 1300
|
||||
}
|
After Width: | Height: | Size: 38 KiB |
After Width: | Height: | Size: 93 KiB |
After Width: | Height: | Size: 206 KiB |
After Width: | Height: | Size: 13 KiB |
After Width: | Height: | Size: 14 KiB |
|
@ -0,0 +1,42 @@
|
|||
import math
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
|
||||
|
||||
# Assumes 0 <= x <= max_x
def sigmoid(x, slope=0.1, offset=0, max_x=1, flip=False) -> float:
    """Logistic curve over the input range up to ``max_x``, with output in (0, 1).

    The midpoint (where the result is 0.5) sits at ``(max_x / 2) * (1 + offset)``,
    so ``offset=0`` centres the curve and a negative ``offset`` moves the
    midpoint towards 0.

    Args:
        x: Input value; must not exceed ``max_x``.
        slope: Divisor applied to the centred input; smaller values give a
            steeper transition.
        offset: Relative shift of the midpoint, in units of ``max_x / 2``.
        max_x: Upper end of the input range.
        flip: If true, return the mirrored curve ``1 - s`` (falling instead
            of rising).

    Returns:
        The (optionally mirrored) logistic value.

    Raises:
        AssertionError: If ``x`` is greater than ``max_x``.
    """
    assert x <= max_x
    centered = x - (max_x / 2) - (offset * max_x / 2)
    s = 1 / (1 + math.exp(-centered / slope))
    # The logistic value lives in (0, 1) regardless of max_x, so mirroring must
    # be done around 1, not around max_x (the two only coincide for max_x == 1).
    return 1 - s if flip else s
|
||||
|
||||
# Images are written next to this script. The script path is made absolute
# first: if __file__ is a bare relative name, dirname() returns '' and a
# "{dir}/{name}.png" pattern would then point at the filesystem root.
curve_dir = os.path.dirname(os.path.abspath(__file__))

# One PNG per curve, each sampled on [0, 1].
for name, f in [
    ('common', lambda x: sigmoid(x, slope=0.05, offset=-0.6, flip=True)),
    ('dialect', lambda x: sigmoid(x, slope=0.08, offset=-0.2)),
    ('kanji', lambda x: x ** 5),
    ('katakana', lambda x: 0 if x > 0.5 else 1),
    ('nhk', lambda x: sigmoid(x, slope=0.03, offset=-0.6, flip=True)),
    ('wordsum', lambda x: x),
]:
    plt.rc('font', size=33)
    plt.xlim(-0.05, 1.05)
    plt.ylim(-0.05, 1.05)
    plt.locator_params(nbins=2)
    space = np.linspace(0, 1, 1000)
    p = [f(n) for n in space]
    plt.plot(space, p, linewidth=5)
    plt.savefig(os.path.join(curve_dir, f"{name}.png"))
    # Reset the figure so the next curve is not drawn on top of this one.
    plt.clf()

# Sentence-length curve: same layout, but sampled on [0, 24].
plt.rc('font', size=33)
plt.xlim(-0.05, 24.05)
plt.ylim(-0.05, 1.05)
plt.locator_params(nbins=3)
space = np.linspace(0, 24, 1000)
p = [sigmoid(n, slope=1.4, max_x=24) for n in space]
plt.plot(space, p, linewidth=5)
plt.savefig(os.path.join(curve_dir, "sentence_length.png"))
|
After Width: | Height: | Size: 13 KiB |
After Width: | Height: | Size: 6.4 KiB |
After Width: | Height: | Size: 12 KiB |
After Width: | Height: | Size: 19 KiB |
After Width: | Height: | Size: 15 KiB |
After Width: | Height: | Size: 149 KiB |
After Width: | Height: | Size: 115 KiB |
After Width: | Height: | Size: 101 KiB |
After Width: | Height: | Size: 138 KiB |
After Width: | Height: | Size: 28 KiB |
After Width: | Height: | Size: 52 KiB |
After Width: | Height: | Size: 145 KiB |
After Width: | Height: | Size: 134 KiB |
After Width: | Height: | Size: 72 KiB |
After Width: | Height: | Size: 598 KiB |
After Width: | Height: | Size: 432 KiB |
After Width: | Height: | Size: 19 KiB |
After Width: | Height: | Size: 71 KiB |
After Width: | Height: | Size: 66 KiB |
After Width: | Height: | Size: 102 KiB |
After Width: | Height: | Size: 215 KiB |
|
@ -0,0 +1,226 @@
|
|||
:root {
|
||||
--black : hsl(0, 0%, 0%);
|
||||
--black2 : hsl(60, 17%, 11%);
|
||||
--black3 : hsl(70, 8%, 15%);
|
||||
--blue : hsl(190, 81%, 67%);
|
||||
--grey : hsl(55, 8%, 26%);
|
||||
--orange : hsl(32, 98%, 56%);
|
||||
--orange2 : hsl(30, 83%, 34%);
|
||||
--orange3 : hsl(47, 100%, 79%);
|
||||
--purple : hsl(261, 100%, 75%);
|
||||
--red : hsl(0, 93%, 59%);
|
||||
--red2 : hsl(338, 95%, 56%);
|
||||
--white : hsl(0, 0%, 97%);
|
||||
--white2 : hsl(60, 36%, 96%);
|
||||
--white3 : hsl(60, 30%, 96%);
|
||||
--yellow : hsl(54, 70%, 68%);
|
||||
--yellow2 : hsl(80, 76%, 53%);
|
||||
--yellow3 : hsl(60, 12%, 79%);
|
||||
--yellow4 : hsl(55, 11%, 22%);
|
||||
--yellow5 : hsl(50, 11%, 41%);
|
||||
}
|
||||
|
||||
.red {
|
||||
color: red;
|
||||
}
|
||||
|
||||
.black { color: var(--black); }
|
||||
.black2 { color: var(--black2); }
|
||||
.black3 { color: var(--black3); }
|
||||
.blue { color: var(--blue); }
|
||||
.grey { color: var(--grey); }
|
||||
.orange { color: var(--orange); }
|
||||
.orange2 { color: var(--orange2); }
|
||||
.orange3 { color: var(--orange3); }
|
||||
.purple { color: var(--purple); }
|
||||
.red { color: var(--red); }
|
||||
.red2 { color: var(--red2); }
|
||||
.white { color: var(--white); }
|
||||
.white2 { color: var(--white2); }
|
||||
.white3 { color: var(--white3); }
|
||||
.yellow { color: var(--yellow); }
|
||||
.yellow2 { color: var(--yellow2); }
|
||||
.yellow3 { color: var(--yellow3); }
|
||||
.yellow4 { color: var(--yellow4); }
|
||||
.yellow5 { color: var(--yellow5); }
|
||||
|
||||
.bak-black { background-color: var(--black); }
|
||||
.bak-black2 { background-color: var(--black2); }
|
||||
.bak-black3 { background-color: var(--black3); }
|
||||
.bak-blue { background-color: var(--blue); }
|
||||
.bak-grey { background-color: var(--grey); }
|
||||
.bak-orange { background-color: var(--orange); }
|
||||
.bak-orange2 { background-color: var(--orange2); }
|
||||
.bak-orange3 { background-color: var(--orange3); }
|
||||
.bak-purple { background-color: var(--purple); }
|
||||
.bak-red { background-color: var(--red); }
|
||||
.bak-red2 { background-color: var(--red2); }
|
||||
.bak-white { background-color: var(--white); }
|
||||
.bak-white2 { background-color: var(--white2); }
|
||||
.bak-white3 { background-color: var(--white3); }
|
||||
.bak-yellow { background-color: var(--yellow); }
|
||||
.bak-yellow2 { background-color: var(--yellow2); }
|
||||
.bak-yellow3 { background-color: var(--yellow3); }
|
||||
.bak-yellow4 { background-color: var(--yellow4); }
|
||||
.bak-yellow5 { background-color: var(--yellow5); }
|
||||
|
||||
.columns {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
.columns > div {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
/* Tallsystemer */
|
||||
.num-systems > div {
|
||||
font-size: 0.75em;
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
.num-system-title {
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
.num-span-red > span {
|
||||
color: var(--red);
|
||||
margin-left: 0.2em;
|
||||
}
|
||||
.num-span-blue > span {
|
||||
color: var(--blue);
|
||||
margin-left: 0.2em;
|
||||
}
|
||||
.num-span-green > span {
|
||||
color: var(--yellow2);
|
||||
margin-left: 0.2em;
|
||||
}
|
||||
.num-span-orange > span {
|
||||
color: var(--orange);
|
||||
margin-left: 0.2em;
|
||||
}
|
||||
|
||||
/* Fetch Decode Execute*/
|
||||
.fde-table-item {
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
||||
.fde-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(8, 1fr);
|
||||
column-gap: 0;
|
||||
row-gap: 0;
|
||||
justify-items: stretch;
|
||||
}
|
||||
|
||||
.fde-grid > * {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.fde-c2 { grid-column-start: 2; }
|
||||
.fde-c3 { grid-column-start: 3; }
|
||||
.fde-c4 { grid-column-start: 4; }
|
||||
.fde-c5 { grid-column-start: 5; }
|
||||
.fde-c6 { grid-column-start: 6; }
|
||||
.fde-c7 { grid-column-start: 7; }
|
||||
.fde-c8 { grid-column-start: 8; }
|
||||
|
||||
.fde-r2 { grid-row-start: 2; }
|
||||
.fde-r3 { grid-row-start: 3; }
|
||||
.fde-r4 { grid-row-start: 4; }
|
||||
.fde-r5 { grid-row-start: 5; }
|
||||
.fde-r6 { grid-row-start: 6; }
|
||||
.fde-r7 { grid-row-start: 7; }
|
||||
.fde-r8 { grid-row-start: 8; }
|
||||
|
||||
/* RGB */
|
||||
.rgb-pvv {color: #283681;}
|
||||
.rgb-red {color: #FF0000;}
|
||||
.rgb-gre {color: #00FF00;}
|
||||
.rgb-blu {color: #0000FF;}
|
||||
.rgb-cof {color: #C0FFEE;}
|
||||
.rgb-whi {color: #FFFFFF;}
|
||||
.rgb-bla {color: #000000;}
|
||||
|
||||
/* Network diagrams */
|
||||
.prev-netdiagram {
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
.net-title {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.net-title > h3 {
|
||||
display: inline;
|
||||
padding: 1em;
|
||||
}
|
||||
|
||||
/* Misc */
|
||||
|
||||
.replacable-fragment {
|
||||
position: relative;
|
||||
margin: auto;
|
||||
}
|
||||
|
||||
.replacable-fragment > div {
|
||||
position:absolute;
|
||||
top:0;
|
||||
/* TODO: fix this properly */
|
||||
left:25%;
|
||||
}
|
||||
|
||||
/* footer { */
|
||||
/* position: absolute; */
|
||||
/* left: 0; */
|
||||
/* bottom: 0; */
|
||||
/* width: 100%; */
|
||||
/* background-color: grey; */
|
||||
/* color: white; */
|
||||
/* text-align: center; */
|
||||
/* } */
|
||||
|
||||
.reveal {
|
||||
position: relative;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
.slides {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 60px; /* leave space for the footer */
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
footer {
|
||||
position: fixed;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
height: 60px;
|
||||
color: grey;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
.col-9 {
|
||||
flex: 9;
|
||||
}
|
||||
|
||||
/* Narrow column of a .grid flex row: 3 parts next to .col-9's 9 parts. */
.col-3 {
  flex: 3;
}
|
||||
|
||||
.row-1 {
|
||||
height: 50%;
|
||||
overflow: auto;
|
||||
}
|
||||
.row-2 {
|
||||
height: 50%;
|
||||
overflow: auto;
|
||||
}
|