From 02de85a585448db47297f33157ac56689448336c Mon Sep 17 00:00:00 2001 From: h7x4 Date: Thu, 26 Dec 2024 00:16:24 +0100 Subject: [PATCH] Initial commit --- .gitignore | 3 + Cargo.lock | 716 +++++++++++++++++++++++++++++++++ Cargo.toml | 20 + flake.lock | 61 +++ flake.nix | 61 +++ src/data.rs | 100 +++++ src/doctype_entity_resolver.rs | 46 +++ src/event_queue_reader.rs | 91 +++++ src/main.rs | 252 ++++++++++++ 9 files changed, 1350 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 src/data.rs create mode 100644 src/doctype_entity_resolver.rs create mode 100644 src/event_queue_reader.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0812318 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target + +result \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..c8e86a5 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,716 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "deunicode" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00" + +[[package]] +name = "dummy" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ee4e39146145f7dd28e6c85ffdce489d93c0d9c88121063b8aacabbd9858d2" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + +[[package]] +name = "fake" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "661cb0601b5f4050d1e65452c5b0ea555c0b3e88fb5ed7855906adc6c42523ef" +dependencies = [ + "deunicode", + "dummy", + "rand", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indicatif" +version = "0.17.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "js-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.37.1" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "serde" +version = "1.0.216" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.216" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "singlethreaded-sax-multithreaded-parse" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "env_logger", + "fake", + "indicatif", + "log", + "quick-xml", + "rayon", + "regex", + "serde", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.90" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = 
"0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..a208ffb --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "singlethreaded-sax-multithreaded-parse" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.95" +clap = { version = "4.5.23", features = ["derive"] } +env_logger = "0.11.6" +fake = { version = "3.0.1", features = ["derive"] } +indicatif = "0.17.9" +log = "0.4.22" +# quick-xml = { version = "0.37.1", features = 
["serde", "serialize"] } +quick-xml = { path = "/home/h7x4/git/quick-xml", features = ["serde", "serialize"] } +rayon = "1.10.0" +regex = "1.11.1" +serde = { version = "1.0.216", features = ["derive"] } + +[profile.release] +debug = 1 \ No newline at end of file diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..f8df3db --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "jmdict-src": { + "flake": false, + "locked": { + "narHash": "sha256-QhbMFVI/yEvz/xq5flhCt0rg5rHObLnXjo1bGNSGwa8=", + "type": "file", + "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz" + }, + "original": { + "type": "file", + "url": "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1734649271, + "narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "jmdict-src": "jmdict-src", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1734834660, + "narHash": "sha256-bm8V+Cu8rWJA+vKQnc94mXTpSDgvedyoDKxTVi/uJfw=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "b070e6030118680977bc2388868c4b3963872134", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..b4b3690 --- /dev/null +++ b/flake.nix @@ -0,0 +1,61 @@ +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + + rust-overlay.url = "github:oxalica/rust-overlay"; + rust-overlay.inputs.nixpkgs.follows = "nixpkgs"; + + jmdict-src.url = "http://ftp.edrdg.org/pub/Nihongo/JMdict.gz"; + 
jmdict-src.flake = false; + }; + + outputs = { self, nixpkgs, rust-overlay, jmdict-src }: + let + inherit (nixpkgs) lib; + + systems = [ + "x86_64-linux" + "aarch64-linux" + "x86_64-darwin" + "aarch64-darwin" + ]; + + forAllSystems = f: lib.genAttrs systems (system: let + pkgs = import nixpkgs { + inherit system; + overlays = [ + (import rust-overlay) + ]; + }; + + rust-bin = rust-overlay.lib.mkRustBin { } pkgs.buildPackages; + toolchain = rust-bin.stable.latest.default.override { + extensions = [ "rust-src" "rust-analyzer" "rust-std" ]; + }; + in f system pkgs toolchain); + in { + devShells = forAllSystems (system: pkgs: toolchain: { + default = pkgs.mkShell { + nativeBuildInputs = [ + toolchain + pkgs.cargo-flamegraph + ]; + + RUST_SRC_PATH = "${toolchain}/lib/rustlib/src/rust/library"; + }; + }); + + packages = forAllSystems (system: pkgs: toolchain: { + jmdict = pkgs.runCommand "jmdict" { + nativeBuildInputs = with pkgs; [ + gzip + xmlformat + ]; + } '' + mkdir -p "$out" + gzip -dkc ${jmdict-src} > "$out/JMdict.xml" + xmlformat -i "$out/JMdict.xml" + ''; + }); + }; +} diff --git a/src/data.rs b/src/data.rs new file mode 100644 index 0000000..0f0ce16 --- /dev/null +++ b/src/data.rs @@ -0,0 +1,100 @@ +use fake::Dummy; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct KanjiElement { + #[serde(rename = "keb")] + reading: String, + + // news: Option, + // ichi: Option, + // spec: Option, + // gai: Option, + // nf: Option, + + #[serde(rename = "ke_pri", default)] + priorities: Vec, + + #[serde(rename = "ke_inf", default)] + info: Vec, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct ReadingElement { + #[serde(rename = "reb")] + reading: String, + + // news: Option, + // ichi: Option, + // spec: Option, + // gai: Option, + // nf: Option, + + #[serde(rename = "re_pri", default)] + priorities: Vec, + + #[serde(rename = "re_inf", default)] + info: 
Vec, + + #[serde(rename = "re_restr", default)] + restrictions: Vec, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct LanguageSource { + language: String, + phrase: Option, + fully_describes_sense: bool, + constructed_from_smaller_words: bool, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct Glossary { + language: String, + phrase: String, + r#type: Option, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct XRefParts { + kanji_ref: Option, + reading_ref: Option, + sense_num: Option, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct XRef { + entry_id: String, + reading: String, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct Sense { + id: u64, + antonyms: Vec, + dialects: Vec, + fields: Vec, + info: Vec, + language_source: Vec, + glossary: Vec, + misc: Vec, + pos: Vec, + restricted_to_kanji: Vec, + restricted_to_reading: Vec, + see_also: Vec, +} + +#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)] +pub struct Entry { + #[serde(rename = "ent_seq", default)] + id: u64, + + #[serde(rename = "k_ele", default)] + kanji: Vec, + + #[serde(rename = "r_ele", default)] + readings: Vec, + + #[serde(rename = "sense", default)] + senses: Vec<()>, +} diff --git a/src/doctype_entity_resolver.rs b/src/doctype_entity_resolver.rs new file mode 100644 index 0000000..4c948df --- /dev/null +++ b/src/doctype_entity_resolver.rs @@ -0,0 +1,46 @@ +use quick_xml::{de::EntityResolver, events::BytesText}; +use regex::bytes::Regex; +use std::{collections::BTreeMap, string::FromUtf8Error}; + +#[derive(Debug, Clone)] +pub struct DocTypeEntityResolver { + re: Regex, + map: BTreeMap, +} + +impl Default for DocTypeEntityResolver { + fn default() -> Self { + let mut map = BTreeMap::new(); + + map.insert("lt".to_string(), "<".to_string()); + 
map.insert("gt".to_string(), ">".to_string()); + map.insert("amp".to_string(), "&".to_string()); + map.insert("apos".to_string(), "'".to_string()); + map.insert("quot".to_string(), "\"".to_string()); + + Self { + // We do not focus on true parsing in this example + // You should use special libraries to parse DTD + re: Regex::new(r#""#).unwrap(), + map, + } + } +} + +impl EntityResolver for DocTypeEntityResolver { + type Error = FromUtf8Error; + + fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> { + for cap in self.re.captures_iter(&doctype) { + self.map.insert( + String::from_utf8(cap[1].to_vec())?, + String::from_utf8(cap[2].to_vec())?, + ); + } + Ok(()) + } + + fn resolve(&self, entity: &str) -> Option<&str> { + self.map.get(entity).map(|s| s.as_str()) + } +} diff --git a/src/event_queue_reader.rs b/src/event_queue_reader.rs new file mode 100644 index 0000000..eb15775 --- /dev/null +++ b/src/event_queue_reader.rs @@ -0,0 +1,91 @@ +use std::collections::VecDeque; + +use quick_xml::{de::{PayloadEvent, XmlRead}, events::Event, DeError, Decoder}; + +/// Converts raw events into semi-trimmed events, independently of the way the +/// events are read. +struct StartTrimmer { + /// If `true`, then leading whitespace will be removed from next returned + /// [`Event::Text`]. This field is set to `true` after reading each event + /// except [`Event::Text`] and [`Event::CData`], so [`Event::Text`] events + /// read right after them are not trimmed. + trim_start: bool, +} + +impl StartTrimmer { + /// Converts raw reader's event into a payload event. + /// Returns `None`, if event should be skipped. 
+ #[inline(always)] + fn trim<'a>(&mut self, event: Event<'a>) -> Option> { + let (event, trim_next_event) = match event { + Event::DocType(e) => (PayloadEvent::DocType(e), true), + Event::Start(e) => (PayloadEvent::Start(e), true), + Event::End(e) => (PayloadEvent::End(e), true), + Event::Eof => (PayloadEvent::Eof, true), + + // Do not trim next text event after Text or CDATA event + Event::CData(e) => (PayloadEvent::CData(e), false), + Event::Text(mut e) => { + // If event is empty after trimming, skip it + if self.trim_start && e.inplace_trim_start() { + return None; + } + (PayloadEvent::Text(e), false) + } + + _ => return None, + }; + self.trim_start = trim_next_event; + Some(event) + } +} + +impl Default for StartTrimmer { + #[inline] + fn default() -> Self { + Self { trim_start: true } + } +} + +pub struct EventReader<'i> { + events: VecDeque>, + start_trimmer: StartTrimmer, +} + +impl<'i> EventReader<'i> { + pub fn new(events: VecDeque>) -> Self { + Self { + events, + start_trimmer: Default::default(), + } + } +} + +impl<'i> XmlRead<'i> for EventReader<'i> { + fn next(&mut self) -> Result, DeError> { + loop { + // TODO: fix the returned error + let event = self.events.pop_front().ok_or(DeError::UnexpectedEof)?; + // let event = self.events + if let Some(event) = self.start_trimmer.trim(event) { + return Ok(event); + } + } + } + + fn read_to_end(&mut self, name: quick_xml::name::QName) -> Result<(), DeError> { + while let Some(event) = self.events.pop_front() { + if let Event::End(ref e) = event { + if e.name() == name { + return Ok(()); + } + } + } + + Err(DeError::UnexpectedEof) + } + + fn decoder(&self) -> Decoder { + Decoder { } + } +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..6be62b2 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,252 @@ +use std::{ + collections::VecDeque, + fs::File, + io::{BufRead, BufReader, Seek, SeekFrom}, + thread, + time::Instant, +}; + +use clap::Parser; + +use data::Entry; 
+use doctype_entity_resolver::DocTypeEntityResolver; +use event_queue_reader::EventReader; +use fake::{Fake, Faker}; +use indicatif::{ProgressBar, ProgressStyle}; +use quick_xml::{ + de::{Deserializer, EntityResolver}, + events::Event, + DeError, Reader, +}; +use rayon::iter::{ParallelBridge, ParallelIterator}; +use serde::Deserialize; + +use std::sync::mpsc::channel; + +mod data; +mod doctype_entity_resolver; +mod event_queue_reader; + +#[derive(Parser)] +struct Cli { + #[command(subcommand)] + cmd: Command, +} + +#[derive(Parser)] +enum Command { + #[command()] + Parse(ParseArgs), + + #[command()] + Generate, +} + +#[derive(Parser)] +struct ParseArgs { + #[arg(short, long)] + file: String, +} + +fn main() { + let args = Cli::parse(); + env_logger::init(); + match args.cmd { + Command::Parse(parse) => { + log::info!("Parsing file: {}", parse.file); + parse_xml(&parse.file).unwrap(); + } + Command::Generate => { + log::info!("Generating random data"); + + let data: Vec = (0..10).map(|_| Faker.fake()).collect(); + let xml = quick_xml::se::to_string_with_root("data", &data).unwrap(); + println!("{}", xml); + } + } +} + +// TODO: can we avoid into_owned here? +fn parse_events_for_tag<'i, R>( + reader: &mut Reader, + tagname: &str, +) -> Result>, anyhow::Error> +where + R: BufRead, +{ + let mut events = VecDeque::new(); + + loop { + let mut buf = Vec::new(); + let event = { reader.read_event_into(&mut buf)? 
}; + match event { + Event::Start(ref e) if e.name().as_ref() == tagname.as_bytes() => { + events.push_back(event.into_owned()); + break; + } + Event::Eof => { + // TODO: fix + anyhow::bail!("EOF"); + } + _ => {} + } + } + + loop { + let mut buf = Vec::new(); + let event = reader.read_event_into(&mut buf)?; + match event { + Event::End(ref e) if e.name().as_ref() == tagname.as_bytes() => { + events.push_back(event.into_owned()); + break; + } + _ => { + events.push_back(event.into_owned()); + } + } + } + + Ok(events) +} + +fn parse_entry_from_events( + events: VecDeque>, + resolver: &DocTypeEntityResolver, +) -> anyhow::Result { + let reader = EventReader::new(events); + // NOTE: this constructor is made `pub` in a local fork, this is not + // `pub` with the upstream crate + let mut deserializer = Deserializer::new(reader, resolver.clone()); + + Entry::deserialize(&mut deserializer).map_err(|e| e.into()) +} + +fn singlethreaded_parser( + reader: Reader>, + progress_bar: ProgressBar, + resolver: DocTypeEntityResolver, +) -> anyhow::Result<()> { + let mut buf_reader = reader.into_inner(); + let mut deserializer = + quick_xml::de::Deserializer::with_resolver(&mut buf_reader, resolver.clone()); + loop { + let entry = Entry::deserialize(&mut deserializer); + if let Err(DeError::UnexpectedEof) = entry { + break Ok(()); + } + progress_bar.set_position(deserializer.get_ref().get_ref().buffer_position()); + } +} + +fn single_threaded_event_list_parser( + mut reader: Reader>, + progress_bar: ProgressBar, + resolver: DocTypeEntityResolver, +) -> anyhow::Result<()> { + loop { + let events = parse_events_for_tag(&mut reader, "entry")?; + progress_bar.set_position(reader.buffer_position()); + let entry = parse_entry_from_events(events.clone(), &resolver); + // println!("{:?}", entry); + // println!("{:?}\n", events); + } +} + +fn multithreaded_parser( + mut reader: Reader>, + progress_bar: ProgressBar, + resolver: DocTypeEntityResolver, +) -> anyhow::Result<()> { + let (sender, 
receiver) = channel(); + + let handle1 = thread::spawn(move || loop { + let events = parse_events_for_tag(&mut reader, "entry").unwrap(); + progress_bar.set_position(reader.buffer_position()); + sender.send(events).unwrap(); + }); + + // TODO: more threads + let handle2 = thread::spawn(move || { + receiver.into_iter().for_each(|event_list| { + let entry = parse_entry_from_events(event_list, &resolver); + // println!("{:?}", entry); + }); + }); + + println!("Waiting for threads to finish"); + handle1.join(); + println!("First thread finished"); + handle2.join(); + println!("Threads finished"); + + Ok(()) +} + +fn multithreaded_rayon_parser( + mut reader: Reader>, + progress_bar: ProgressBar, + resolver: DocTypeEntityResolver, +) -> anyhow::Result<()> { + let (sender, receiver) = channel(); + + let handle1 = thread::spawn(move || loop { + let events = parse_events_for_tag(&mut reader, "entry").unwrap(); + progress_bar.set_position(reader.buffer_position()); + sender.send(events).unwrap(); + }); + + receiver.into_iter().par_bridge().for_each(|event_list| { + let entry = parse_entry_from_events(event_list, &resolver); + // println!("{:?}", entry); + }); + + handle1.join(); + + Ok(()) +} + +fn parse_xml(file_path: &str) -> anyhow::Result> { + let mut buf_reader = BufReader::new(File::open(file_path)?); + + let total_bytes = buf_reader.seek(SeekFrom::End(0))?; + buf_reader.seek(SeekFrom::Start(0))?; + + let mut reader = Reader::from_reader(buf_reader); + let mut buf = Vec::new(); + + let progress_bar = ProgressBar::new(total_bytes); + progress_bar.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")? + .progress_chars("#>-"), + ); + + // Loop until we find the tag, + // collect entity definitions along the way + let mut resolver = DocTypeEntityResolver::default(); + loop { + match reader.read_event_into(&mut buf)? 
{ + Event::Start(ref e) => { + progress_bar.set_position(reader.buffer_position()); + if e.name().as_ref() == b"JMdict" { + break; + } + buf.clear(); + } + Event::DocType(e) => { + resolver.capture(e).unwrap(); + } + _ => {} + } + } + + let now = Instant::now(); + // multithreaded_parser(reader, progress_bar, resolver); + multithreaded_rayon_parser(reader, progress_bar, resolver); + // singlethreaded_parser(reader, progress_bar, resolver); + // single_threaded_event_list_parser(reader, progress_bar, resolver); + let elapsed = now.elapsed(); + println!("Elapsed: {:?}", elapsed); + + Ok(vec![]) +}