Initial commit
This commit is contained in:
commit
02de85a585
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
/target
|
||||
|
||||
result
|
716
Cargo.lock
generated
Normal file
716
Cargo.lock
generated
Normal file
@ -0,0 +1,716 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"anstyle-parse",
|
||||
"anstyle-query",
|
||||
"anstyle-wincon",
|
||||
"colorchoice",
|
||||
"is_terminal_polyfill",
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle"
|
||||
version = "1.0.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-parse"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
|
||||
dependencies = [
|
||||
"utf8parse",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-query"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
|
||||
dependencies = [
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anstyle-wincon"
|
||||
version = "3.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
|
||||
dependencies = [
|
||||
"anstyle",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.95"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.5.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.5.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
||||
|
||||
[[package]]
|
||||
name = "console"
|
||||
version = "0.15.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
|
||||
dependencies = [
|
||||
"encode_unicode",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"unicode-width",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deunicode"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00"
|
||||
|
||||
[[package]]
|
||||
name = "dummy"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b3ee4e39146145f7dd28e6c85ffdce489d93c0d9c88121063b8aacabbd9858d2"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||
|
||||
[[package]]
|
||||
name = "env_filter"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
|
||||
dependencies = [
|
||||
"log",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.11.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
"env_filter",
|
||||
"humantime",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fake"
|
||||
version = "3.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "661cb0601b5f4050d1e65452c5b0ea555c0b3e88fb5ed7855906adc6c42523ef"
|
||||
dependencies = [
|
||||
"deunicode",
|
||||
"dummy",
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "ident_case"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "indicatif"
|
||||
version = "0.17.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
|
||||
dependencies = [
|
||||
"console",
|
||||
"number_prefix",
|
||||
"portable-atomic",
|
||||
"unicode-width",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.76"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.169"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "number_prefix"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.37.1"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.216"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.216"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "singlethreaded-sax-multithreaded-parse"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"env_logger",
|
||||
"fake",
|
||||
"indicatif",
|
||||
"log",
|
||||
"quick-xml",
|
||||
"rayon",
|
||||
"regex",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.90"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"wasm-bindgen-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.59.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
20
Cargo.toml
Normal file
20
Cargo.toml
Normal file
@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "singlethreaded-sax-multithreaded-parse"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.95"
|
||||
clap = { version = "4.5.23", features = ["derive"] }
|
||||
env_logger = "0.11.6"
|
||||
fake = { version = "3.0.1", features = ["derive"] }
|
||||
indicatif = "0.17.9"
|
||||
log = "0.4.22"
|
||||
# quick-xml = { version = "0.37.1", features = ["serde", "serialize"] }
|
||||
quick-xml = { path = "/home/h7x4/git/quick-xml", features = ["serde", "serialize"] }
|
||||
rayon = "1.10.0"
|
||||
regex = "1.11.1"
|
||||
serde = { version = "1.0.216", features = ["derive"] }
|
||||
|
||||
[profile.release]
|
||||
debug = 1
|
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
@ -0,0 +1,61 @@
|
||||
{
|
||||
"nodes": {
|
||||
"jmdict-src": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"narHash": "sha256-QhbMFVI/yEvz/xq5flhCt0rg5rHObLnXjo1bGNSGwa8=",
|
||||
"type": "file",
|
||||
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "file",
|
||||
"url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1734649271,
|
||||
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"jmdict-src": "jmdict-src",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"rust-overlay": "rust-overlay"
|
||||
}
|
||||
},
|
||||
"rust-overlay": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1734834660,
|
||||
"narHash": "sha256-bm8V+Cu8rWJA+vKQnc94mXTpSDgvedyoDKxTVi/uJfw=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "b070e6030118680977bc2388868c4b3963872134",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
61
flake.nix
Normal file
61
flake.nix
Normal file
@ -0,0 +1,61 @@
|
||||
{
  # Flake inputs: the package set, the Rust toolchain overlay, and the raw
  # dictionary dump that the `jmdict` package unpacks.
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";

    rust-overlay.url = "github:oxalica/rust-overlay";
    # Make the overlay reuse this flake's nixpkgs instead of pinning its own.
    rust-overlay.inputs.nixpkgs.follows = "nixpkgs";

    # Plain file input (not a flake): the gzip-compressed JMdict file.
    jmdict-src.url = "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz";
    jmdict-src.flake = false;
  };

  outputs = { self, nixpkgs, rust-overlay, jmdict-src }:
    let
      lib = nixpkgs.lib;

      # Platforms for which outputs are generated.
      systems = [
        "x86_64-linux"
        "aarch64-linux"
        "x86_64-darwin"
        "aarch64-darwin"
      ];

      # Calls `f system pkgs toolchain` once per supported system and collects
      # the results into an attribute set keyed by system name.
      forAllSystems = f: lib.genAttrs systems (system:
        let
          pkgs = import nixpkgs {
            inherit system;
            overlays = [
              (import rust-overlay)
            ];
          };

          rust-bin = rust-overlay.lib.mkRustBin { } pkgs.buildPackages;

          # Latest stable toolchain plus the components needed for editor support.
          toolchain = rust-bin.stable.latest.default.override {
            extensions = [ "rust-src" "rust-analyzer" "rust-std" ];
          };
        in
        f system pkgs toolchain);
    in
    {
      # Development shell: the Rust toolchain plus cargo-flamegraph.
      devShells = forAllSystems (system: pkgs: toolchain: {
        default = pkgs.mkShell {
          nativeBuildInputs = [
            toolchain
            pkgs.cargo-flamegraph
          ];

          RUST_SRC_PATH = "${toolchain}/lib/rustlib/src/rust/library";
        };
      });

      # `jmdict`: decompresses the dictionary input into $out/JMdict.xml and
      # reformats it in place with xmlformat.
      packages = forAllSystems (system: pkgs: toolchain: {
        jmdict = pkgs.runCommand "jmdict" {
          nativeBuildInputs = [
            pkgs.gzip
            pkgs.xmlformat
          ];
        } ''
          mkdir -p "$out"
          gzip -dkc ${jmdict-src} > "$out/JMdict.xml"
          xmlformat -i "$out/JMdict.xml"
        '';
      });
    };
}
|
100
src/data.rs
Normal file
100
src/data.rs
Normal file
@ -0,0 +1,100 @@
|
||||
use fake::Dummy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct KanjiElement {
|
||||
#[serde(rename = "keb")]
|
||||
reading: String,
|
||||
|
||||
// news: Option<u8>,
|
||||
// ichi: Option<u8>,
|
||||
// spec: Option<u8>,
|
||||
// gai: Option<u8>,
|
||||
// nf: Option<u8>,
|
||||
|
||||
#[serde(rename = "ke_pri", default)]
|
||||
priorities: Vec<String>,
|
||||
|
||||
#[serde(rename = "ke_inf", default)]
|
||||
info: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct ReadingElement {
|
||||
#[serde(rename = "reb")]
|
||||
reading: String,
|
||||
|
||||
// news: Option<u8>,
|
||||
// ichi: Option<u8>,
|
||||
// spec: Option<u8>,
|
||||
// gai: Option<u8>,
|
||||
// nf: Option<u8>,
|
||||
|
||||
#[serde(rename = "re_pri", default)]
|
||||
priorities: Vec<String>,
|
||||
|
||||
#[serde(rename = "re_inf", default)]
|
||||
info: Vec<String>,
|
||||
|
||||
#[serde(rename = "re_restr", default)]
|
||||
restrictions: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct LanguageSource {
|
||||
language: String,
|
||||
phrase: Option<String>,
|
||||
fully_describes_sense: bool,
|
||||
constructed_from_smaller_words: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct Glossary {
|
||||
language: String,
|
||||
phrase: String,
|
||||
r#type: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct XRefParts {
|
||||
kanji_ref: Option<String>,
|
||||
reading_ref: Option<String>,
|
||||
sense_num: Option<i32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct XRef {
|
||||
entry_id: String,
|
||||
reading: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct Sense {
|
||||
id: u64,
|
||||
antonyms: Vec<XRefParts>,
|
||||
dialects: Vec<String>,
|
||||
fields: Vec<String>,
|
||||
info: Vec<String>,
|
||||
language_source: Vec<LanguageSource>,
|
||||
glossary: Vec<Glossary>,
|
||||
misc: Vec<String>,
|
||||
pos: Vec<String>,
|
||||
restricted_to_kanji: Vec<String>,
|
||||
restricted_to_reading: Vec<String>,
|
||||
see_also: Vec<XRefParts>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
|
||||
pub struct Entry {
|
||||
#[serde(rename = "ent_seq", default)]
|
||||
id: u64,
|
||||
|
||||
#[serde(rename = "k_ele", default)]
|
||||
kanji: Vec<KanjiElement>,
|
||||
|
||||
#[serde(rename = "r_ele", default)]
|
||||
readings: Vec<ReadingElement>,
|
||||
|
||||
#[serde(rename = "sense", default)]
|
||||
senses: Vec<()>,
|
||||
}
|
46
src/doctype_entity_resolver.rs
Normal file
46
src/doctype_entity_resolver.rs
Normal file
@ -0,0 +1,46 @@
|
||||
use quick_xml::{de::EntityResolver, events::BytesText};
|
||||
use regex::bytes::Regex;
|
||||
use std::{collections::BTreeMap, string::FromUtf8Error};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DocTypeEntityResolver {
|
||||
re: Regex,
|
||||
map: BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
impl Default for DocTypeEntityResolver {
|
||||
fn default() -> Self {
|
||||
let mut map = BTreeMap::new();
|
||||
|
||||
map.insert("lt".to_string(), "<".to_string());
|
||||
map.insert("gt".to_string(), ">".to_string());
|
||||
map.insert("amp".to_string(), "&".to_string());
|
||||
map.insert("apos".to_string(), "'".to_string());
|
||||
map.insert("quot".to_string(), "\"".to_string());
|
||||
|
||||
Self {
|
||||
// We do not focus on true parsing in this example
|
||||
// You should use special libraries to parse DTD
|
||||
re: Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#).unwrap(),
|
||||
map,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EntityResolver for DocTypeEntityResolver {
|
||||
type Error = FromUtf8Error;
|
||||
|
||||
fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> {
|
||||
for cap in self.re.captures_iter(&doctype) {
|
||||
self.map.insert(
|
||||
String::from_utf8(cap[1].to_vec())?,
|
||||
String::from_utf8(cap[2].to_vec())?,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn resolve(&self, entity: &str) -> Option<&str> {
|
||||
self.map.get(entity).map(|s| s.as_str())
|
||||
}
|
||||
}
|
91
src/event_queue_reader.rs
Normal file
91
src/event_queue_reader.rs
Normal file
@ -0,0 +1,91 @@
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use quick_xml::{de::{PayloadEvent, XmlRead}, events::Event, DeError, Decoder};
|
||||
|
||||
/// Helper that holds the state needed to convert raw reader events into
/// semi-trimmed payload events, independently of how the events are read.
struct StartTrimmer {
    /// If `true`, leading whitespace is removed from the next returned
    /// [`Event::Text`]. The flag is set to `true` after every forwarded event
    /// except [`Event::Text`] and [`Event::CData`], so a text event that
    /// directly follows one of those two is not trimmed.
    trim_start: bool,
}
|
||||
|
||||
impl StartTrimmer {
|
||||
/// Converts raw reader's event into a payload event.
|
||||
/// Returns `None`, if event should be skipped.
|
||||
#[inline(always)]
|
||||
fn trim<'a>(&mut self, event: Event<'a>) -> Option<PayloadEvent<'a>> {
|
||||
let (event, trim_next_event) = match event {
|
||||
Event::DocType(e) => (PayloadEvent::DocType(e), true),
|
||||
Event::Start(e) => (PayloadEvent::Start(e), true),
|
||||
Event::End(e) => (PayloadEvent::End(e), true),
|
||||
Event::Eof => (PayloadEvent::Eof, true),
|
||||
|
||||
// Do not trim next text event after Text or CDATA event
|
||||
Event::CData(e) => (PayloadEvent::CData(e), false),
|
||||
Event::Text(mut e) => {
|
||||
// If event is empty after trimming, skip it
|
||||
if self.trim_start && e.inplace_trim_start() {
|
||||
return None;
|
||||
}
|
||||
(PayloadEvent::Text(e), false)
|
||||
}
|
||||
|
||||
_ => return None,
|
||||
};
|
||||
self.trim_start = trim_next_event;
|
||||
Some(event)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for StartTrimmer {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self { trim_start: true }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EventReader<'i> {
|
||||
events: VecDeque<Event<'i>>,
|
||||
start_trimmer: StartTrimmer,
|
||||
}
|
||||
|
||||
impl<'i> EventReader<'i> {
|
||||
pub fn new(events: VecDeque<Event<'i>>) -> Self {
|
||||
Self {
|
||||
events,
|
||||
start_trimmer: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'i> XmlRead<'i> for EventReader<'i> {
|
||||
fn next(&mut self) -> Result<quick_xml::de::PayloadEvent<'i>, DeError> {
|
||||
loop {
|
||||
// TODO: fix the returned error
|
||||
let event = self.events.pop_front().ok_or(DeError::UnexpectedEof)?;
|
||||
// let event = self.events
|
||||
if let Some(event) = self.start_trimmer.trim(event) {
|
||||
return Ok(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn read_to_end(&mut self, name: quick_xml::name::QName) -> Result<(), DeError> {
|
||||
while let Some(event) = self.events.pop_front() {
|
||||
if let Event::End(ref e) = event {
|
||||
if e.name() == name {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(DeError::UnexpectedEof)
|
||||
}
|
||||
|
||||
fn decoder(&self) -> Decoder {
|
||||
Decoder { }
|
||||
}
|
||||
}
|
252
src/main.rs
Normal file
252
src/main.rs
Normal file
@ -0,0 +1,252 @@
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
fs::File,
|
||||
io::{BufRead, BufReader, Seek, SeekFrom},
|
||||
thread,
|
||||
time::Instant,
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
|
||||
use data::Entry;
|
||||
use doctype_entity_resolver::DocTypeEntityResolver;
|
||||
use event_queue_reader::EventReader;
|
||||
use fake::{Fake, Faker};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use quick_xml::{
|
||||
de::{Deserializer, EntityResolver},
|
||||
events::Event,
|
||||
DeError, Reader,
|
||||
};
|
||||
use rayon::iter::{ParallelBridge, ParallelIterator};
|
||||
use serde::Deserialize;
|
||||
|
||||
use std::sync::mpsc::channel;
|
||||
|
||||
mod data;
|
||||
mod doctype_entity_resolver;
|
||||
mod event_queue_reader;
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
cmd: Command,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
enum Command {
|
||||
#[command()]
|
||||
Parse(ParseArgs),
|
||||
|
||||
#[command()]
|
||||
Generate,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct ParseArgs {
|
||||
#[arg(short, long)]
|
||||
file: String,
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args = Cli::parse();
|
||||
env_logger::init();
|
||||
match args.cmd {
|
||||
Command::Parse(parse) => {
|
||||
log::info!("Parsing file: {}", parse.file);
|
||||
parse_xml(&parse.file).unwrap();
|
||||
}
|
||||
Command::Generate => {
|
||||
log::info!("Generating random data");
|
||||
|
||||
let data: Vec<Entry> = (0..10).map(|_| Faker.fake()).collect();
|
||||
let xml = quick_xml::se::to_string_with_root("data", &data).unwrap();
|
||||
println!("{}", xml);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: can we avoid into_owned here?
|
||||
fn parse_events_for_tag<'i, R>(
|
||||
reader: &mut Reader<R>,
|
||||
tagname: &str,
|
||||
) -> Result<VecDeque<Event<'i>>, anyhow::Error>
|
||||
where
|
||||
R: BufRead,
|
||||
{
|
||||
let mut events = VecDeque::new();
|
||||
|
||||
loop {
|
||||
let mut buf = Vec::new();
|
||||
let event = { reader.read_event_into(&mut buf)? };
|
||||
match event {
|
||||
Event::Start(ref e) if e.name().as_ref() == tagname.as_bytes() => {
|
||||
events.push_back(event.into_owned());
|
||||
break;
|
||||
}
|
||||
Event::Eof => {
|
||||
// TODO: fix
|
||||
anyhow::bail!("EOF");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let mut buf = Vec::new();
|
||||
let event = reader.read_event_into(&mut buf)?;
|
||||
match event {
|
||||
Event::End(ref e) if e.name().as_ref() == tagname.as_bytes() => {
|
||||
events.push_back(event.into_owned());
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
events.push_back(event.into_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(events)
|
||||
}
|
||||
|
||||
fn parse_entry_from_events(
|
||||
events: VecDeque<Event<'_>>,
|
||||
resolver: &DocTypeEntityResolver,
|
||||
) -> anyhow::Result<Entry> {
|
||||
let reader = EventReader::new(events);
|
||||
// NOTE: this constructor is made `pub` in a local fork, this is not
|
||||
// `pub` with the upstream crate
|
||||
let mut deserializer = Deserializer::new(reader, resolver.clone());
|
||||
|
||||
Entry::deserialize(&mut deserializer).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
fn singlethreaded_parser(
|
||||
reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut buf_reader = reader.into_inner();
|
||||
let mut deserializer =
|
||||
quick_xml::de::Deserializer::with_resolver(&mut buf_reader, resolver.clone());
|
||||
loop {
|
||||
let entry = Entry::deserialize(&mut deserializer);
|
||||
if let Err(DeError::UnexpectedEof) = entry {
|
||||
break Ok(());
|
||||
}
|
||||
progress_bar.set_position(deserializer.get_ref().get_ref().buffer_position());
|
||||
}
|
||||
}
|
||||
|
||||
fn single_threaded_event_list_parser(
|
||||
mut reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let events = parse_events_for_tag(&mut reader, "entry")?;
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
let entry = parse_entry_from_events(events.clone(), &resolver);
|
||||
// println!("{:?}", entry);
|
||||
// println!("{:?}\n", events);
|
||||
}
|
||||
}
|
||||
|
||||
fn multithreaded_parser(
|
||||
mut reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
let (sender, receiver) = channel();
|
||||
|
||||
let handle1 = thread::spawn(move || loop {
|
||||
let events = parse_events_for_tag(&mut reader, "entry").unwrap();
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
sender.send(events).unwrap();
|
||||
});
|
||||
|
||||
// TODO: more threads
|
||||
let handle2 = thread::spawn(move || {
|
||||
receiver.into_iter().for_each(|event_list| {
|
||||
let entry = parse_entry_from_events(event_list, &resolver);
|
||||
// println!("{:?}", entry);
|
||||
});
|
||||
});
|
||||
|
||||
println!("Waiting for threads to finish");
|
||||
handle1.join();
|
||||
println!("First thread finished");
|
||||
handle2.join();
|
||||
println!("Threads finished");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn multithreaded_rayon_parser(
|
||||
mut reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
let (sender, receiver) = channel();
|
||||
|
||||
let handle1 = thread::spawn(move || loop {
|
||||
let events = parse_events_for_tag(&mut reader, "entry").unwrap();
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
sender.send(events).unwrap();
|
||||
});
|
||||
|
||||
receiver.into_iter().par_bridge().for_each(|event_list| {
|
||||
let entry = parse_entry_from_events(event_list, &resolver);
|
||||
// println!("{:?}", entry);
|
||||
});
|
||||
|
||||
handle1.join();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_xml(file_path: &str) -> anyhow::Result<Vec<Entry>> {
|
||||
let mut buf_reader = BufReader::new(File::open(file_path)?);
|
||||
|
||||
let total_bytes = buf_reader.seek(SeekFrom::End(0))?;
|
||||
buf_reader.seek(SeekFrom::Start(0))?;
|
||||
|
||||
let mut reader = Reader::from_reader(buf_reader);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let progress_bar = ProgressBar::new(total_bytes);
|
||||
progress_bar.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")?
|
||||
.progress_chars("#>-"),
|
||||
);
|
||||
|
||||
// Loop until we find the <JMdict> tag,
|
||||
// collect entity definitions along the way
|
||||
let mut resolver = DocTypeEntityResolver::default();
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf)? {
|
||||
Event::Start(ref e) => {
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
if e.name().as_ref() == b"JMdict" {
|
||||
break;
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
Event::DocType(e) => {
|
||||
resolver.capture(e).unwrap();
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
// multithreaded_parser(reader, progress_bar, resolver);
|
||||
multithreaded_rayon_parser(reader, progress_bar, resolver);
|
||||
// singlethreaded_parser(reader, progress_bar, resolver);
|
||||
// single_threaded_event_list_parser(reader, progress_bar, resolver);
|
||||
let elapsed = now.elapsed();
|
||||
println!("Elapsed: {:?}", elapsed);
|
||||
|
||||
Ok(vec![])
|
||||
}
|
Loading…
Reference in New Issue
Block a user