Initial commit

This commit is contained in:
Oystein Kristoffer Tveit 2024-12-26 00:16:24 +01:00
commit 02de85a585
Signed by: oysteikt
GPG Key ID: 9F2F7D8250F35146
9 changed files with 1350 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/target
result

716
Cargo.lock generated Normal file
View File

@ -0,0 +1,716 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys",
]
[[package]]
name = "anyhow"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "console"
version = "0.15.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
dependencies = [
"encode_unicode",
"libc",
"once_cell",
"unicode-width",
"windows-sys",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "darling"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "deunicode"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00"
[[package]]
name = "dummy"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3ee4e39146145f7dd28e6c85ffdce489d93c0d9c88121063b8aacabbd9858d2"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "encode_unicode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "env_filter"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
dependencies = [
"log",
"regex",
]
[[package]]
name = "env_logger"
version = "0.11.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0"
dependencies = [
"anstream",
"anstyle",
"env_filter",
"humantime",
"log",
]
[[package]]
name = "fake"
version = "3.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "661cb0601b5f4050d1e65452c5b0ea555c0b3e88fb5ed7855906adc6c42523ef"
dependencies = [
"deunicode",
"dummy",
"rand",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "indicatif"
version = "0.17.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
dependencies = [
"console",
"number_prefix",
"portable-atomic",
"unicode-width",
"web-time",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "js-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quick-xml"
version = "0.37.1"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "serde"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.216"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "singlethreaded-sax-multithreaded-parse"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"env_logger",
"fake",
"indicatif",
"log",
"quick-xml",
"rayon",
"regex",
"serde",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "unicode-width"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"byteorder",
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

20
Cargo.toml Normal file
View File

@ -0,0 +1,20 @@
[package]
name = "singlethreaded-sax-multithreaded-parse"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0.95"
clap = { version = "4.5.23", features = ["derive"] }
env_logger = "0.11.6"
fake = { version = "3.0.1", features = ["derive"] }
indicatif = "0.17.9"
log = "0.4.22"
# The crates.io release is kept here for reference only; the active dependency
# below is a local checkout of quick-xml.
# quick-xml = { version = "0.37.1", features = ["serde", "serialize"] }
# NOTE(review): this is an absolute path on the author's machine, so the crate
# only builds where that checkout exists. src/main.rs notes that it relies on a
# `Deserializer` constructor that was made `pub` in this local fork.
quick-xml = { path = "/home/h7x4/git/quick-xml", features = ["serde", "serialize"] }
rayon = "1.10.0"
regex = "1.11.1"
serde = { version = "1.0.216", features = ["derive"] }
[profile.release]
# Keep some debug information in release builds.
# NOTE(review): presumably so profilers (the dev shell ships cargo-flamegraph)
# can symbolize optimized binaries — confirm.
debug = 1

61
flake.lock generated Normal file
View File

@ -0,0 +1,61 @@
{
"nodes": {
"jmdict-src": {
"flake": false,
"locked": {
"narHash": "sha256-QhbMFVI/yEvz/xq5flhCt0rg5rHObLnXjo1bGNSGwa8=",
"type": "file",
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"
},
"original": {
"type": "file",
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1734649271,
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"jmdict-src": "jmdict-src",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
}
},
"rust-overlay": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1734834660,
"narHash": "sha256-bm8V+Cu8rWJA+vKQnc94mXTpSDgvedyoDKxTVi/uJfw=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "b070e6030118680977bc2388868c4b3963872134",
"type": "github"
},
"original": {
"owner": "oxalica",
"repo": "rust-overlay",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

61
flake.nix Normal file
View File

@ -0,0 +1,61 @@
# Nix flake providing a Rust development shell and a package that unpacks and
# formats the JMdict XML file.
{
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
rust-overlay.url = "github:oxalica/rust-overlay";
# Make rust-overlay use the same nixpkgs as this flake.
rust-overlay.inputs.nixpkgs.follows = "nixpkgs";
# The gzipped dictionary is fetched as a plain file input, not as a flake.
jmdict-src.url = "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz";
jmdict-src.flake = false;
};
outputs = { self, nixpkgs, rust-overlay, jmdict-src }:
let
inherit (nixpkgs) lib;
# Systems for which the outputs below are generated.
systems = [
"x86_64-linux"
"aarch64-linux"
"x86_64-darwin"
"aarch64-darwin"
];
# Calls `f system pkgs toolchain` once per entry of `systems` and collects
# the results into an attribute set keyed by system name.
forAllSystems = f: lib.genAttrs systems (system: let
pkgs = import nixpkgs {
inherit system;
overlays = [
(import rust-overlay)
];
};
rust-bin = rust-overlay.lib.mkRustBin { } pkgs.buildPackages;
# Latest stable toolchain with sources, rust-analyzer and std included.
toolchain = rust-bin.stable.latest.default.override {
extensions = [ "rust-src" "rust-analyzer" "rust-std" ];
};
in f system pkgs toolchain);
in {
# Development shell: the Rust toolchain plus cargo-flamegraph.
devShells = forAllSystems (system: pkgs: toolchain: {
default = pkgs.mkShell {
nativeBuildInputs = [
toolchain
pkgs.cargo-flamegraph
];
RUST_SRC_PATH = "${toolchain}/lib/rustlib/src/rust/library";
};
});
# `jmdict`: decompresses the fetched archive into $out/JMdict.xml and then
# reformats that file in place with xmlformat.
packages = forAllSystems (system: pkgs: toolchain: {
jmdict = pkgs.runCommand "jmdict" {
nativeBuildInputs = with pkgs; [
gzip
xmlformat
];
} ''
mkdir -p "$out"
gzip -dkc ${jmdict-src} > "$out/JMdict.xml"
xmlformat -i "$out/JMdict.xml"
'';
});
};
}

100
src/data.rs Normal file
View File

@ -0,0 +1,100 @@
use fake::Dummy;
use serde::{Deserialize, Serialize};
/// One kanji-element record of an [`Entry`], (de)serialized through serde
/// under the XML names given in the `rename` attributes.
///
/// NOTE(review): `keb` / `ke_pri` / `ke_inf` presumably mirror the JMdict DTD
/// element names — confirm against the DTD.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct KanjiElement {
/// Serialized as `keb`. There is no `default`, so the field is required
/// when deserializing.
#[serde(rename = "keb")]
reading: String,
// news: Option<u8>,
// ichi: Option<u8>,
// spec: Option<u8>,
// gai: Option<u8>,
// nf: Option<u8>,
/// Serialized as `ke_pri`; defaults to an empty list when absent.
#[serde(rename = "ke_pri", default)]
priorities: Vec<String>,
/// Serialized as `ke_inf`; defaults to an empty list when absent.
#[serde(rename = "ke_inf", default)]
info: Vec<String>,
}
/// One reading-element record of an [`Entry`], (de)serialized through serde
/// under the XML names given in the `rename` attributes.
///
/// NOTE(review): `reb` / `re_pri` / `re_inf` / `re_restr` presumably mirror
/// the JMdict DTD element names — confirm against the DTD.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct ReadingElement {
/// Serialized as `reb`. There is no `default`, so the field is required
/// when deserializing.
#[serde(rename = "reb")]
reading: String,
// news: Option<u8>,
// ichi: Option<u8>,
// spec: Option<u8>,
// gai: Option<u8>,
// nf: Option<u8>,
/// Serialized as `re_pri`; defaults to an empty list when absent.
#[serde(rename = "re_pri", default)]
priorities: Vec<String>,
/// Serialized as `re_inf`; defaults to an empty list when absent.
#[serde(rename = "re_inf", default)]
info: Vec<String>,
/// Serialized as `re_restr`; defaults to an empty list when absent.
#[serde(rename = "re_restr", default)]
restrictions: Vec<String>,
}
/// Source-language information attached to a [`Sense`] (see
/// `Sense::language_source`).
///
/// There are no serde renames, so fields (de)serialize under their Rust names.
/// NOTE(review): the meaning of the two flags is inferred from their names
/// only — confirm against the dictionary format.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct LanguageSource {
language: String,
// Optional: may be absent.
phrase: Option<String>,
fully_describes_sense: bool,
constructed_from_smaller_words: bool,
}
/// A glossary item of a [`Sense`] (see `Sense::glossary`): a phrase together
/// with its language and an optional type tag.
///
/// There are no serde renames, so fields (de)serialize under their Rust names.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Glossary {
language: String,
phrase: String,
// `type` is a Rust keyword, hence the raw identifier.
r#type: Option<String>,
}
/// The optional components of a cross-reference, used by `Sense::antonyms`
/// and `Sense::see_also`.
///
/// Every part is optional. There are no serde renames, so fields
/// (de)serialize under their Rust names.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct XRefParts {
kanji_ref: Option<String>,
reading_ref: Option<String>,
sense_num: Option<i32>,
}
/// A cross-reference made of an entry id and a reading.
///
/// NOTE(review): no other type in this file refers to `XRef`; it is
/// presumably meant to hold a resolved `XRefParts` — confirm intended use.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct XRef {
entry_id: String,
reading: String,
}
/// A full sense record: cross-references, glossary items and assorted string
/// tags, identified by `id`.
///
/// There are no serde renames or defaults, so fields (de)serialize under their
/// Rust names.
/// NOTE(review): `Entry::senses` is currently `Vec<()>`, so this type is not
/// part of what `Entry` deserializes — presumably work in progress.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Sense {
id: u64,
antonyms: Vec<XRefParts>,
dialects: Vec<String>,
fields: Vec<String>,
info: Vec<String>,
language_source: Vec<LanguageSource>,
glossary: Vec<Glossary>,
misc: Vec<String>,
pos: Vec<String>,
restricted_to_kanji: Vec<String>,
restricted_to_reading: Vec<String>,
see_also: Vec<XRefParts>,
}
/// One dictionary entry as (de)serialized through serde.
///
/// Every field has `default`, so a missing element yields `0` or an empty
/// list instead of a deserialization error.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Entry {
/// Serialized as `ent_seq`.
#[serde(rename = "ent_seq", default)]
id: u64,
/// Serialized as repeated `k_ele` elements.
#[serde(rename = "k_ele", default)]
kanji: Vec<KanjiElement>,
/// Serialized as repeated `r_ele` elements.
#[serde(rename = "r_ele", default)]
readings: Vec<ReadingElement>,
/// Serialized as repeated `sense` elements. The element type is `()`, so no
/// sense content is stored.
/// NOTE(review): presumably a placeholder until `Sense` gets an XML mapping.
#[serde(rename = "sense", default)]
senses: Vec<()>,
}

View File

@ -0,0 +1,46 @@
use quick_xml::{de::EntityResolver, events::BytesText};
use regex::bytes::Regex;
use std::{collections::BTreeMap, string::FromUtf8Error};
/// Entity resolver backed by a name → replacement-text table.
///
/// The table is pre-filled by the `Default` impl and extended by `capture`
/// with every declaration that `re` matches in a DOCTYPE.
#[derive(Debug, Clone)]
pub struct DocTypeEntityResolver {
/// Pattern used to find entity declarations in DOCTYPE text.
re: Regex,
/// Entity name → replacement text.
map: BTreeMap<String, String>,
}
impl Default for DocTypeEntityResolver {
    /// Creates a resolver that already knows `lt`, `gt`, `amp`, `apos` and
    /// `quot`, together with the pattern used to pick up further
    /// `<!ENTITY name "value">` declarations.
    ///
    /// # Panics
    /// Panics if the built-in pattern fails to compile.
    fn default() -> Self {
        let predefined = [
            ("lt", "<"),
            ("gt", ">"),
            ("amp", "&"),
            ("apos", "'"),
            ("quot", "\""),
        ];
        let map: BTreeMap<String, String> = predefined
            .iter()
            .map(|&(name, replacement)| (name.to_owned(), replacement.to_owned()))
            .collect();

        // This is not a real DTD parser: only simple quoted entity
        // declarations are recognised. Use a dedicated library for full DTDs.
        let re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#).unwrap();

        Self { re, map }
    }
}
impl EntityResolver for DocTypeEntityResolver {
    type Error = FromUtf8Error;

    /// Records every entity declaration matched in `doctype`.
    ///
    /// A later declaration of the same name replaces the earlier value.
    ///
    /// # Errors
    /// Returns the conversion error if a matched name or value is not valid
    /// UTF-8; declarations processed before the failure stay recorded.
    fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> {
        for declaration in self.re.captures_iter(&doctype) {
            let name = String::from_utf8(declaration[1].to_vec())?;
            let replacement = String::from_utf8(declaration[2].to_vec())?;
            self.map.insert(name, replacement);
        }
        Ok(())
    }

    /// Looks up the replacement text for `entity`, if one is known.
    fn resolve(&self, entity: &str) -> Option<&str> {
        self.map.get(entity).map(String::as_str)
    }
}

91
src/event_queue_reader.rs Normal file
View File

@ -0,0 +1,91 @@
use std::collections::VecDeque;
use quick_xml::{de::{PayloadEvent, XmlRead}, events::Event, DeError, Decoder};
/// Converts raw reader events into semi-trimmed payload events, independently
/// of the way the events were read.
struct StartTrimmer {
/// If `true`, leading whitespace is removed from the next returned
/// [`Event::Text`]. This field is set to `true` after every event except
/// [`Event::Text`] and [`Event::CData`], so an [`Event::Text`] that comes
/// right after one of those two is not trimmed.
trim_start: bool,
}
impl StartTrimmer {
    /// Turns a raw reader event into a payload event.
    ///
    /// Returns `None` when the event should be skipped: a text event that is
    /// empty after trimming, or any event kind that has no payload
    /// counterpart here.
    #[inline(always)]
    fn trim<'a>(&mut self, event: Event<'a>) -> Option<PayloadEvent<'a>> {
        let payload = match event {
            Event::Text(mut text) => {
                // A text event that is empty after trimming is dropped, and
                // the trimming state is left untouched.
                if self.trim_start && text.inplace_trim_start() {
                    return None;
                }
                PayloadEvent::Text(text)
            }
            Event::CData(cdata) => PayloadEvent::CData(cdata),
            Event::DocType(doctype) => PayloadEvent::DocType(doctype),
            Event::Start(start) => PayloadEvent::Start(start),
            Event::End(end) => PayloadEvent::End(end),
            Event::Eof => PayloadEvent::Eof,
            _ => return None,
        };

        // Text that directly follows Text or CDATA must keep its leading
        // whitespace; after every other event the next text is trimmed.
        self.trim_start = !matches!(payload, PayloadEvent::Text(_) | PayloadEvent::CData(_));
        Some(payload)
    }
}
impl Default for StartTrimmer {
#[inline]
fn default() -> Self {
Self { trim_start: true }
}
}
/// An event source that replays an in-memory queue of already-read XML events
/// instead of reading from an input stream.
pub struct EventReader<'i> {
/// Events still to be handed out, consumed from the front.
events: VecDeque<Event<'i>>,
/// Whitespace-trimming state applied to events as they are handed out.
start_trimmer: StartTrimmer,
}
impl<'i> EventReader<'i> {
pub fn new(events: VecDeque<Event<'i>>) -> Self {
Self {
events,
start_trimmer: Default::default(),
}
}
}
impl<'i> XmlRead<'i> for EventReader<'i> {
    /// Returns the next payload event from the queue, skipping every event the
    /// trimmer drops.
    ///
    /// # Errors
    /// Returns `DeError::UnexpectedEof` once the queue is exhausted.
    fn next(&mut self) -> Result<quick_xml::de::PayloadEvent<'i>, DeError> {
        loop {
            // TODO: fix the returned error
            let event = self.events.pop_front().ok_or(DeError::UnexpectedEof)?;
            if let Some(event) = self.start_trimmer.trim(event) {
                return Ok(event);
            }
        }
    }

    /// Discards queued events up to and including the end tag that closes the
    /// element called `name`, whose start tag has already been consumed.
    ///
    /// Nested elements with the same name are tracked, so the end tag of an
    /// inner `<name>` does not end the skip early.
    ///
    /// # Errors
    /// Returns `DeError::UnexpectedEof` if the queue runs out before the
    /// matching end tag is found.
    fn read_to_end(&mut self, name: quick_xml::name::QName) -> Result<(), DeError> {
        // Number of nested `<name>` elements opened and not yet closed.
        let mut depth = 0usize;
        while let Some(event) = self.events.pop_front() {
            match event {
                Event::Start(ref e) if e.name() == name => depth += 1,
                Event::End(ref e) if e.name() == name => {
                    if depth == 0 {
                        return Ok(());
                    }
                    depth -= 1;
                }
                _ => {}
            }
        }
        Err(DeError::UnexpectedEof)
    }

    /// Returns a freshly constructed decoder.
    fn decoder(&self) -> Decoder {
        Decoder {}
    }
}

252
src/main.rs Normal file
View File

@ -0,0 +1,252 @@
use std::{
collections::VecDeque,
fs::File,
io::{BufRead, BufReader, Seek, SeekFrom},
thread,
time::Instant,
};
use clap::Parser;
use data::Entry;
use doctype_entity_resolver::DocTypeEntityResolver;
use event_queue_reader::EventReader;
use fake::{Fake, Faker};
use indicatif::{ProgressBar, ProgressStyle};
use quick_xml::{
de::{Deserializer, EntityResolver},
events::Event,
DeError, Reader,
};
use rayon::iter::{ParallelBridge, ParallelIterator};
use serde::Deserialize;
use std::sync::mpsc::channel;
mod data;
mod doctype_entity_resolver;
mod event_queue_reader;
// Top-level command-line interface, parsed with clap's derive API.
// Plain `//` comments are used on purpose: clap turns `///` doc comments into
// user-visible help text.
#[derive(Parser)]
struct Cli {
// The subcommand selected on the command line.
#[command(subcommand)]
cmd: Command,
}
// Subcommands of the tool (plain `//` comments so clap's help output stays
// unchanged).
#[derive(Parser)]
enum Command {
// Parse an XML file; options are in `ParseArgs`.
#[command()]
Parse(ParseArgs),
// Print randomly generated sample entries as XML.
#[command()]
Generate,
}
// Options of the `parse` subcommand (plain `//` comments so clap's help output
// stays unchanged).
#[derive(Parser)]
struct ParseArgs {
// Path of the XML file to parse, given as `-f` / `--file`.
#[arg(short, long)]
file: String,
}
fn main() {
let args = Cli::parse();
env_logger::init();
match args.cmd {
Command::Parse(parse) => {
log::info!("Parsing file: {}", parse.file);
parse_xml(&parse.file).unwrap();
}
Command::Generate => {
log::info!("Generating random data");
let data: Vec<Entry> = (0..10).map(|_| Faker.fake()).collect();
let xml = quick_xml::se::to_string_with_root("data", &data).unwrap();
println!("{}", xml);
}
}
}
/// Reads the next complete `<tagname>…</tagname>` element from `reader` and
/// returns its events, from the start tag to the matching end tag inclusive,
/// as owned values.
///
/// Everything before the start tag is skipped. Collection stops at the first
/// end tag named `tagname`.
///
/// # Errors
/// Returns an error if the reader fails, or if the input ends before the
/// start tag is found or before the element is closed.
// TODO: can we avoid into_owned here?
fn parse_events_for_tag<'i, R>(
    reader: &mut Reader<R>,
    tagname: &str,
) -> Result<VecDeque<Event<'i>>, anyhow::Error>
where
    R: BufRead,
{
    let tag = tagname.as_bytes();
    let mut events = VecDeque::new();
    // One scratch buffer for all reads; every kept event is copied out of it
    // with `into_owned`, so it can be cleared before each read.
    let mut buf = Vec::new();

    // Phase 1: skip ahead to the start tag.
    loop {
        buf.clear();
        let event = reader.read_event_into(&mut buf)?;
        match event {
            Event::Start(ref e) if e.name().as_ref() == tag => {
                events.push_back(event.into_owned());
                break;
            }
            Event::Eof => {
                // TODO: fix
                anyhow::bail!("EOF");
            }
            _ => {}
        }
    }

    // Phase 2: collect everything up to and including the end tag.
    loop {
        buf.clear();
        let event = reader.read_event_into(&mut buf)?;
        match event {
            Event::End(ref e) if e.name().as_ref() == tag => {
                events.push_back(event.into_owned());
                break;
            }
            // Without this arm a truncated document would queue `Eof` events
            // forever, because the reader keeps returning `Eof` at end of input.
            Event::Eof => {
                anyhow::bail!("unexpected EOF inside <{}> element", tagname);
            }
            _ => events.push_back(event.into_owned()),
        }
    }

    Ok(events)
}
/// Deserializes one `Entry` from a queue of already-read XML events.
///
/// `resolver` is cloned for each call because the deserializer takes its
/// resolver by value.
///
/// # Errors
/// Returns the deserialization error, converted into `anyhow::Error`.
fn parse_entry_from_events(
    events: VecDeque<Event<'_>>,
    resolver: &DocTypeEntityResolver,
) -> anyhow::Result<Entry> {
    // NOTE: this constructor is made `pub` in a local fork, this is not
    // `pub` with the upstream crate
    let mut deserializer = Deserializer::new(EventReader::new(events), resolver.clone());
    let entry = Entry::deserialize(&mut deserializer)?;
    Ok(entry)
}
/// Baseline parser: deserializes entries one after another on the calling
/// thread, straight from the underlying buffered reader.
///
/// Deserialized entries and all errors other than `DeError::UnexpectedEof`
/// are discarded. The progress bar is moved to the reader position after each
/// attempt. The first `UnexpectedEof` ends the loop with `Ok(())`.
fn singlethreaded_parser(
    reader: Reader<BufReader<File>>,
    progress_bar: ProgressBar,
    resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
    let mut input = reader.into_inner();
    let mut deserializer =
        quick_xml::de::Deserializer::with_resolver(&mut input, resolver.clone());

    loop {
        match Entry::deserialize(&mut deserializer) {
            Err(DeError::UnexpectedEof) => return Ok(()),
            _ => progress_bar.set_position(deserializer.get_ref().get_ref().buffer_position()),
        }
    }
}
/// Single-threaded variant of the event-list pipeline: collects the events of
/// each `<entry>` element and deserializes them on the same thread.
///
/// The result of each deserialization, success or failure, is discarded.
///
/// # Errors
/// The loop only ends when `parse_events_for_tag` fails, which includes
/// reaching the end of the input, so this function always returns that error.
fn single_threaded_event_list_parser(
    mut reader: Reader<BufReader<File>>,
    progress_bar: ProgressBar,
    resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
    loop {
        let events = parse_events_for_tag(&mut reader, "entry")?;
        progress_bar.set_position(reader.buffer_position());
        // The queue is not used again, so it is moved instead of cloned.
        let _entry = parse_entry_from_events(events, &resolver);
    }
}
fn multithreaded_parser(
mut reader: Reader<BufReader<File>>,
progress_bar: ProgressBar,
resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
let (sender, receiver) = channel();
let handle1 = thread::spawn(move || loop {
let events = parse_events_for_tag(&mut reader, "entry").unwrap();
progress_bar.set_position(reader.buffer_position());
sender.send(events).unwrap();
});
// TODO: more threads
let handle2 = thread::spawn(move || {
receiver.into_iter().for_each(|event_list| {
let entry = parse_entry_from_events(event_list, &resolver);
// println!("{:?}", entry);
});
});
println!("Waiting for threads to finish");
handle1.join();
println!("First thread finished");
handle2.join();
println!("Threads finished");
Ok(())
}
fn multithreaded_rayon_parser(
mut reader: Reader<BufReader<File>>,
progress_bar: ProgressBar,
resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
let (sender, receiver) = channel();
let handle1 = thread::spawn(move || loop {
let events = parse_events_for_tag(&mut reader, "entry").unwrap();
progress_bar.set_position(reader.buffer_position());
sender.send(events).unwrap();
});
receiver.into_iter().par_bridge().for_each(|event_list| {
let entry = parse_entry_from_events(event_list, &resolver);
// println!("{:?}", entry);
});
handle1.join();
Ok(())
}
/// Opens `file_path`, reads the document header up to the `<JMdict>` start
/// tag (collecting entity declarations from the DOCTYPE on the way), then
/// runs the selected parser over the rest of the file and prints the elapsed
/// time.
///
/// The returned vector is currently always empty; the parsers discard the
/// entries they produce.
///
/// # Errors
/// Returns an error if the file cannot be opened or read, the progress-bar
/// template is invalid, an entity declaration is not valid UTF-8, the input
/// ends before a `<JMdict>` start tag is seen, or the parser fails.
fn parse_xml(file_path: &str) -> anyhow::Result<Vec<Entry>> {
    let mut buf_reader = BufReader::new(File::open(file_path)?);
    // Seek to the end to learn the file size, then rewind for reading.
    let total_bytes = buf_reader.seek(SeekFrom::End(0))?;
    buf_reader.seek(SeekFrom::Start(0))?;

    let mut reader = Reader::from_reader(buf_reader);
    let mut buf = Vec::new();

    let progress_bar = ProgressBar::new(total_bytes);
    progress_bar.set_style(
        ProgressStyle::default_bar()
            .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")?
            .progress_chars("#>-"),
    );

    // Loop until we find the <JMdict> tag,
    // collect entity definitions along the way
    let mut resolver = DocTypeEntityResolver::default();
    loop {
        match reader.read_event_into(&mut buf)? {
            Event::Start(ref e) => {
                progress_bar.set_position(reader.buffer_position());
                if e.name().as_ref() == b"JMdict" {
                    break;
                }
            }
            Event::DocType(e) => resolver.capture(e)?,
            // The reader keeps returning `Eof` at end of input, so without this
            // arm a file with no <JMdict> element would loop forever.
            Event::Eof => {
                anyhow::bail!("reached end of file before finding a <JMdict> start tag");
            }
            _ => {}
        }
        // The event is no longer borrowed here; drop its bytes so the buffer
        // does not grow with every header event.
        buf.clear();
    }

    let now = Instant::now();
    // Exactly one of the following strategies is active at a time; the others
    // are kept for switching between them when comparing timings.
    // multithreaded_parser(reader, progress_bar, resolver);
    multithreaded_rayon_parser(reader, progress_bar, resolver)?;
    // singlethreaded_parser(reader, progress_bar, resolver);
    // single_threaded_event_list_parser(reader, progress_bar, resolver);
    let elapsed = now.elapsed();
    println!("Elapsed: {:?}", elapsed);

    Ok(vec![])
}