Initial commit

This commit is contained in:
2024-12-26 00:16:24 +01:00
commit 02de85a585
9 changed files with 1350 additions and 0 deletions

100
src/data.rs Normal file
View File

@@ -0,0 +1,100 @@
use fake::Dummy;
use serde::{Deserialize, Serialize};
/// A kanji (spelling) element of a dictionary entry — the `<k_ele>` XML element.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct KanjiElement {
    /// The kanji form itself (`<keb>`).
    #[serde(rename = "keb")]
    reading: String,
    // Commented-out candidates for parsed-out priority sub-fields:
    // news: Option<u8>,
    // ichi: Option<u8>,
    // spec: Option<u8>,
    // gai: Option<u8>,
    // nf: Option<u8>,
    /// Priority tags (`<ke_pri>`); defaults to empty when the element is absent.
    #[serde(rename = "ke_pri", default)]
    priorities: Vec<String>,
    /// Information/irregularity tags (`<ke_inf>`); defaults to empty.
    #[serde(rename = "ke_inf", default)]
    info: Vec<String>,
}
/// A reading (kana) element of a dictionary entry — the `<r_ele>` XML element.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct ReadingElement {
    /// The kana reading itself (`<reb>`).
    #[serde(rename = "reb")]
    reading: String,
    // Commented-out candidates for parsed-out priority sub-fields:
    // news: Option<u8>,
    // ichi: Option<u8>,
    // spec: Option<u8>,
    // gai: Option<u8>,
    // nf: Option<u8>,
    /// Priority tags (`<re_pri>`); defaults to empty when absent.
    #[serde(rename = "re_pri", default)]
    priorities: Vec<String>,
    /// Information tags (`<re_inf>`); defaults to empty.
    #[serde(rename = "re_inf", default)]
    info: Vec<String>,
    /// Kanji forms this reading applies to (`<re_restr>`); empty means
    /// the reading applies to all kanji elements — TODO confirm.
    #[serde(rename = "re_restr", default)]
    restrictions: Vec<String>,
}
/// Source-language information for a loanword sense.
///
/// NOTE(review): no `#[serde(rename = …)]` attributes here — presumably maps
/// to JMdict `<lsource>`; confirm field/XML mapping before relying on it.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct LanguageSource {
    /// Language of the source word.
    language: String,
    /// The source word/phrase itself, when known.
    phrase: Option<String>,
    /// Whether the source fully describes this sense.
    fully_describes_sense: bool,
    /// Whether the word was constructed from smaller source-language words.
    constructed_from_smaller_words: bool,
}
/// A single gloss (translation/definition) of a sense.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Glossary {
    /// Target language of the gloss.
    language: String,
    /// The gloss text.
    phrase: String,
    /// Optional gloss type; `r#type` escapes the `type` keyword.
    r#type: Option<String>,
}
/// The decomposed parts of a cross-reference target: a kanji form and/or a
/// reading, plus an optional sense number within the referenced entry.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct XRefParts {
    /// Kanji form of the referenced entry, if given.
    kanji_ref: Option<String>,
    /// Reading of the referenced entry, if given.
    reading_ref: Option<String>,
    /// 1-based sense number within the referenced entry, if given.
    sense_num: Option<i32>,
}
/// A resolved cross-reference to another entry.
///
/// NOTE(review): not referenced by any other type in this file (`Sense` uses
/// `XRefParts`); presumably intended for a later resolution step — confirm.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct XRef {
    /// Identifier of the referenced entry.
    entry_id: String,
    /// Reading of the referenced entry.
    reading: String,
}
/// One sense (meaning) of an entry.
///
/// NOTE(review): unlike the element structs above, these fields carry no
/// `#[serde(rename = …)]` attributes, and `Entry.senses` is currently
/// `Vec<()>` — this type is not yet wired into deserialization; confirm the
/// field/XML-name mapping before using it.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Sense {
    id: u64,
    /// Cross-references to antonymous entries.
    antonyms: Vec<XRefParts>,
    /// Dialect tags.
    dialects: Vec<String>,
    /// Subject-field tags.
    fields: Vec<String>,
    /// Free-form usage notes.
    info: Vec<String>,
    /// Source-language information for loanwords.
    language_source: Vec<LanguageSource>,
    /// Translations/definitions of this sense.
    glossary: Vec<Glossary>,
    /// Miscellaneous tags.
    misc: Vec<String>,
    /// Part-of-speech tags.
    pos: Vec<String>,
    /// Kanji forms this sense is restricted to, if any.
    restricted_to_kanji: Vec<String>,
    /// Readings this sense is restricted to, if any.
    restricted_to_reading: Vec<String>,
    /// Cross-references to related entries.
    see_also: Vec<XRefParts>,
}
/// A single JMdict `<entry>` element.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Entry {
    /// Entry sequence number (`<ent_seq>`); 0 when absent (serde `default`).
    #[serde(rename = "ent_seq", default)]
    id: u64,
    /// Kanji elements (`<k_ele>`); may be empty.
    #[serde(rename = "k_ele", default)]
    kanji: Vec<KanjiElement>,
    /// Reading elements (`<r_ele>`).
    #[serde(rename = "r_ele", default)]
    readings: Vec<ReadingElement>,
    /// NOTE(review): placeholder — `<sense>` elements are consumed but
    /// discarded (`Vec<()>`), even though a `Sense` struct exists above.
    #[serde(rename = "sense", default)]
    senses: Vec<()>,
}

src/doctype_entity_resolver.rs Normal file
View File

@@ -0,0 +1,46 @@
use quick_xml::{de::EntityResolver, events::BytesText};
use regex::bytes::Regex;
use std::{collections::BTreeMap, string::FromUtf8Error};
/// An `EntityResolver` for quick_xml deserialization that learns custom
/// entities from `<!ENTITY name "value">` declarations inside the DOCTYPE.
#[derive(Debug, Clone)]
pub struct DocTypeEntityResolver {
    /// Matches `<!ENTITY name "value">` declarations (not a full DTD parser).
    re: Regex,
    /// Entity name -> replacement text, pre-seeded with the five XML
    /// predefined entities by `Default`.
    map: BTreeMap<String, String>,
}
impl Default for DocTypeEntityResolver {
fn default() -> Self {
let mut map = BTreeMap::new();
map.insert("lt".to_string(), "<".to_string());
map.insert("gt".to_string(), ">".to_string());
map.insert("amp".to_string(), "&".to_string());
map.insert("apos".to_string(), "'".to_string());
map.insert("quot".to_string(), "\"".to_string());
Self {
// We do not focus on true parsing in this example
// You should use special libraries to parse DTD
re: Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#).unwrap(),
map,
}
}
}
impl EntityResolver for DocTypeEntityResolver {
    type Error = FromUtf8Error;

    /// Scans a DOCTYPE payload for `<!ENTITY name "value">` declarations and
    /// records each one in the lookup table. Fails if a captured name or
    /// value is not valid UTF-8.
    fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> {
        for capture in self.re.captures_iter(&doctype) {
            let name = String::from_utf8(capture[1].to_vec())?;
            let value = String::from_utf8(capture[2].to_vec())?;
            self.map.insert(name, value);
        }
        Ok(())
    }

    /// Looks up a previously captured (or predefined) entity by name.
    fn resolve(&self, entity: &str) -> Option<&str> {
        self.map.get(entity).map(String::as_str)
    }
}

91
src/event_queue_reader.rs Normal file
View File

@@ -0,0 +1,91 @@
use std::collections::VecDeque;
use quick_xml::{de::{PayloadEvent, XmlRead}, events::Event, DeError, Decoder};
/// Converts raw reader events into semi-trimmed payload events, independent
/// of how the events are being read.
struct StartTrimmer {
    /// If `true`, then leading whitespace will be removed from the next
    /// returned [`Event::Text`]. This field is set to `true` after reading
    /// each event except [`Event::Text`] and [`Event::CData`], so
    /// [`Event::Text`] events read right after them are not trimmed.
    trim_start: bool,
}
impl StartTrimmer {
    /// Converts a raw reader event into a payload event.
    /// Returns `None` if the event should be skipped entirely (unsupported
    /// event kinds, or a text event that is empty after trimming).
    #[inline(always)]
    fn trim<'a>(&mut self, event: Event<'a>) -> Option<PayloadEvent<'a>> {
        match event {
            // Text and CDATA suppress trimming of the following text event.
            Event::Text(mut t) => {
                // A text event that becomes empty after trimming is skipped
                // without touching the trimming state.
                if self.trim_start && t.inplace_trim_start() {
                    return None;
                }
                self.trim_start = false;
                Some(PayloadEvent::Text(t))
            }
            Event::CData(c) => {
                self.trim_start = false;
                Some(PayloadEvent::CData(c))
            }
            // Structural events re-arm trimming for the next text event.
            Event::DocType(d) => {
                self.trim_start = true;
                Some(PayloadEvent::DocType(d))
            }
            Event::Start(s) => {
                self.trim_start = true;
                Some(PayloadEvent::Start(s))
            }
            Event::End(e) => {
                self.trim_start = true;
                Some(PayloadEvent::End(e))
            }
            Event::Eof => {
                self.trim_start = true;
                Some(PayloadEvent::Eof)
            }
            // Other event kinds are dropped; trimming state is untouched.
            _ => None,
        }
    }
}
impl Default for StartTrimmer {
#[inline]
fn default() -> Self {
Self { trim_start: true }
}
}
/// An `XmlRead` implementation backed by a pre-collected queue of events
/// rather than a live reader, so a slice of a document can be deserialized
/// independently of the reader that produced it (e.g. on another thread).
pub struct EventReader<'i> {
    /// Remaining events, consumed front-to-back.
    events: VecDeque<Event<'i>>,
    /// Whitespace-trimming state carried across successive events.
    start_trimmer: StartTrimmer,
}
impl<'i> EventReader<'i> {
pub fn new(events: VecDeque<Event<'i>>) -> Self {
Self {
events,
start_trimmer: Default::default(),
}
}
}
impl<'i> XmlRead<'i> for EventReader<'i> {
    /// Pops events off the queue until the trimmer yields one, i.e. skips
    /// whitespace-only text events.
    fn next(&mut self) -> Result<quick_xml::de::PayloadEvent<'i>, DeError> {
        loop {
            // TODO: fix the returned error — an exhausted queue is reported
            // as an unexpected EOF, which the deserializer treats as end of
            // input rather than a distinct "queue drained" condition.
            let event = self.events.pop_front().ok_or(DeError::UnexpectedEof)?;
            if let Some(event) = self.start_trimmer.trim(event) {
                return Ok(event);
            }
        }
    }
    /// Discards queued events up to and including the end tag matching `name`.
    ///
    /// NOTE(review): nesting depth is not tracked, so a nested element with
    /// the same name would terminate the skip early — confirm callers never
    /// skip self-nesting tags.
    fn read_to_end(&mut self, name: quick_xml::name::QName) -> Result<(), DeError> {
        while let Some(event) = self.events.pop_front() {
            if let Event::End(ref e) = event {
                if e.name() == name {
                    return Ok(());
                }
            }
        }
        Err(DeError::UnexpectedEof)
    }
    /// Events are already decoded, so a default decoder suffices.
    /// NOTE(review): `Decoder { }` relies on the local quick_xml fork
    /// exposing this construction; upstream does not allow it.
    fn decoder(&self) -> Decoder {
        Decoder { }
    }
}

252
src/main.rs Normal file
View File

@@ -0,0 +1,252 @@
use std::{
collections::VecDeque,
fs::File,
io::{BufRead, BufReader, Seek, SeekFrom},
thread,
time::Instant,
};
use clap::Parser;
use data::Entry;
use doctype_entity_resolver::DocTypeEntityResolver;
use event_queue_reader::EventReader;
use fake::{Fake, Faker};
use indicatif::{ProgressBar, ProgressStyle};
use quick_xml::{
de::{Deserializer, EntityResolver},
events::Event,
DeError, Reader,
};
use rayon::iter::{ParallelBridge, ParallelIterator};
use serde::Deserialize;
use std::sync::mpsc::channel;
mod data;
mod doctype_entity_resolver;
mod event_queue_reader;
// Command-line interface: a single required subcommand.
// (Plain `//` comments are used on clap items on purpose: `///` doc comments
// would be picked up by clap as help text, changing program output.)
#[derive(Parser)]
struct Cli {
    // Which action to run.
    #[command(subcommand)]
    cmd: Command,
}
// Available subcommands.
// (Plain `//` comments on purpose: `///` would become clap help text.)
#[derive(Parser)]
enum Command {
    // Parse a JMdict XML file and report timing.
    #[command()]
    Parse(ParseArgs),
    // Generate random fake entries and print them as XML.
    #[command()]
    Generate,
}
// Arguments for the `parse` subcommand.
// (Plain `//` comments on purpose: `///` would become clap help text.)
#[derive(Parser)]
struct ParseArgs {
    // Path of the XML file to parse.
    #[arg(short, long)]
    file: String,
}
/// Entry point: parse CLI arguments, initialize logging, dispatch to the
/// chosen subcommand.
fn main() {
    let cli = Cli::parse();
    env_logger::init();
    match cli.cmd {
        Command::Parse(parse_args) => {
            log::info!("Parsing file: {}", parse_args.file);
            parse_xml(&parse_args.file).unwrap();
        }
        Command::Generate => {
            log::info!("Generating random data");
            // Produce ten fake entries and serialize them under a <data> root.
            let mut entries: Vec<Entry> = Vec::with_capacity(10);
            for _ in 0..10 {
                entries.push(Faker.fake());
            }
            let xml = quick_xml::se::to_string_with_root("data", &entries).unwrap();
            println!("{}", xml);
        }
    }
}
// TODO: can we avoid into_owned here?
/// Collects the raw events of the next `<tagname> … </tagname>` element,
/// inclusive of the opening and closing tags themselves.
///
/// Scans forward to the next opening `<tagname>`, then buffers every event
/// up to and including the matching end tag. Nested elements with the same
/// name are tracked with a depth counter so the correct end tag is matched.
///
/// # Errors
/// Returns an error if end of file is reached before a complete element has
/// been read, or if the reader reports a parse error.
fn parse_events_for_tag<'i, R>(
    reader: &mut Reader<R>,
    tagname: &str,
) -> Result<VecDeque<Event<'i>>, anyhow::Error>
where
    R: BufRead,
{
    let mut events = VecDeque::new();
    // One scratch buffer reused for every event, instead of a fresh
    // allocation per iteration.
    let mut buf = Vec::new();
    // Phase 1: skip ahead to the opening tag.
    loop {
        buf.clear();
        let event = reader.read_event_into(&mut buf)?;
        match event {
            Event::Start(ref e) if e.name().as_ref() == tagname.as_bytes() => {
                events.push_back(event.into_owned());
                break;
            }
            Event::Eof => {
                anyhow::bail!("reached EOF while looking for opening <{}> tag", tagname);
            }
            _ => {}
        }
    }
    // Phase 2: buffer everything up to the matching end tag.
    let mut depth = 0usize;
    loop {
        buf.clear();
        let event = reader.read_event_into(&mut buf)?;
        match event {
            // Same-name nested start: remember we need one extra end tag.
            Event::Start(ref e) if e.name().as_ref() == tagname.as_bytes() => {
                depth += 1;
                events.push_back(event.into_owned());
            }
            Event::End(ref e) if e.name().as_ref() == tagname.as_bytes() => {
                events.push_back(event.into_owned());
                if depth == 0 {
                    break;
                }
                depth -= 1;
            }
            // Without this arm a truncated document looped forever:
            // `read_event_into` keeps returning `Eof`, which the old
            // catch-all arm pushed onto the queue indefinitely.
            Event::Eof => {
                anyhow::bail!("reached EOF inside <{}> element", tagname);
            }
            _ => {
                events.push_back(event.into_owned());
            }
        }
    }
    Ok(events)
}
/// Deserializes a single `Entry` from a pre-collected queue of its raw events.
///
/// The resolver is cloned because the deserializer takes ownership and this
/// function is called once per entry, potentially from multiple threads.
fn parse_entry_from_events(
    events: VecDeque<Event<'_>>,
    resolver: &DocTypeEntityResolver,
) -> anyhow::Result<Entry> {
    let reader = EventReader::new(events);
    // NOTE: this constructor is made `pub` in a local fork, this is not
    // `pub` with the upstream crate
    let mut deserializer = Deserializer::new(reader, resolver.clone());
    Entry::deserialize(&mut deserializer).map_err(|e| e.into())
}
/// Benchmark: deserialize entries one after another straight from the reader
/// on the current thread, updating the progress bar with the underlying
/// reader's byte position after each entry.
///
/// # Errors
/// Returns any deserialization error other than end-of-input. (Previously
/// non-EOF errors were silently ignored, which could spin forever on a
/// persistent parse error.)
fn singlethreaded_parser(
    reader: Reader<BufReader<File>>,
    progress_bar: ProgressBar,
    resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
    let mut buf_reader = reader.into_inner();
    // `resolver` is owned and not used again, so move it instead of cloning.
    let mut deserializer = quick_xml::de::Deserializer::with_resolver(&mut buf_reader, resolver);
    loop {
        match Entry::deserialize(&mut deserializer) {
            // End of input: all entries consumed.
            Err(DeError::UnexpectedEof) => break Ok(()),
            // Any other error is a real failure — propagate it.
            Err(e) => break Err(e.into()),
            // The entry value itself is discarded: this path only measures
            // parsing throughput.
            Ok(_entry) => {}
        }
        progress_bar.set_position(deserializer.get_ref().get_ref().buffer_position());
    }
}
/// Benchmark: single-threaded variant that first slices the input into
/// per-`<entry>` event lists, then deserializes each list.
///
/// Terminates with an `Err` when `parse_events_for_tag` hits end of file;
/// there is currently no clean `Ok` exit (EOF detection is a TODO there).
fn single_threaded_event_list_parser(
    mut reader: Reader<BufReader<File>>,
    progress_bar: ProgressBar,
    resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
    loop {
        let events = parse_events_for_tag(&mut reader, "entry")?;
        progress_bar.set_position(reader.buffer_position());
        // Move `events` instead of the previous redundant `events.clone()`
        // (a full copy of every event per entry). Deserialization failures
        // are intentionally ignored: this path only measures throughput.
        let _entry = parse_entry_from_events(events, &resolver);
        // println!("{:?}", _entry);
    }
}
/// Benchmark: two-thread pipeline — a producer slices the input into
/// per-`<entry>` event lists and sends them over a channel; a consumer
/// deserializes them.
///
/// The producer ends by panicking (via `unwrap`) when the slicer hits EOF,
/// which drops the sender and lets the consumer drain the channel and exit.
fn multithreaded_parser(
    mut reader: Reader<BufReader<File>>,
    progress_bar: ProgressBar,
    resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
    let (sender, receiver) = channel();
    // Producer thread: read and forward event lists until EOF.
    let handle1 = thread::spawn(move || loop {
        let events = parse_events_for_tag(&mut reader, "entry").unwrap();
        progress_bar.set_position(reader.buffer_position());
        sender.send(events).unwrap();
    });
    // TODO: more threads
    // Consumer thread: deserialization results are intentionally ignored —
    // this path only measures throughput.
    let handle2 = thread::spawn(move || {
        receiver.into_iter().for_each(|event_list| {
            let _entry = parse_entry_from_events(event_list, &resolver);
        });
    });
    println!("Waiting for threads to finish");
    // The producer is *expected* to end by panicking at EOF, so its join
    // result (an `Err` in that case) is deliberately discarded instead of
    // being silently dropped as an unused `Result`.
    let _ = handle1.join();
    println!("First thread finished");
    // A consumer panic, however, is a real failure — surface it.
    handle2
        .join()
        .map_err(|_| anyhow::anyhow!("consumer thread panicked"))?;
    println!("Threads finished");
    Ok(())
}
/// Benchmark: a producer thread slices the input into per-`<entry>` event
/// lists; rayon's thread pool deserializes them in parallel via `par_bridge`.
///
/// The producer ends by panicking (via `unwrap`) when the slicer hits EOF,
/// which drops the sender and terminates the parallel consumers.
fn multithreaded_rayon_parser(
    mut reader: Reader<BufReader<File>>,
    progress_bar: ProgressBar,
    resolver: DocTypeEntityResolver,
) -> anyhow::Result<()> {
    let (sender, receiver) = channel();
    // Producer thread: read and forward event lists until EOF.
    let handle1 = thread::spawn(move || loop {
        let events = parse_events_for_tag(&mut reader, "entry").unwrap();
        progress_bar.set_position(reader.buffer_position());
        sender.send(events).unwrap();
    });
    // Deserialization results are intentionally ignored: throughput benchmark.
    receiver.into_iter().par_bridge().for_each(|event_list| {
        let _entry = parse_entry_from_events(event_list, &resolver);
    });
    // The producer ends by panicking at EOF; discard its join result
    // explicitly instead of silently dropping an unused `Result`.
    let _ = handle1.join();
    Ok(())
}
/// Parses a JMdict XML file: skips the prologue while capturing
/// `<!ENTITY …>` definitions from the DOCTYPE, then streams `<entry>`
/// elements through the selected parser implementation with a progress bar,
/// and prints the elapsed time.
///
/// Currently always returns an empty `Vec` — collecting the parsed entries
/// is still a TODO; the function is used to time the parse strategies.
///
/// # Errors
/// Fails on I/O errors, XML parse errors, invalid UTF-8 in entity
/// definitions, or if the file ends before a `<JMdict>` root tag is found.
fn parse_xml(file_path: &str) -> anyhow::Result<Vec<Entry>> {
    let mut buf_reader = BufReader::new(File::open(file_path)?);
    // Determine the total size for the progress bar, then rewind.
    let total_bytes = buf_reader.seek(SeekFrom::End(0))?;
    buf_reader.seek(SeekFrom::Start(0))?;
    let mut reader = Reader::from_reader(buf_reader);
    let mut buf = Vec::new();
    let progress_bar = ProgressBar::new(total_bytes);
    progress_bar.set_style(
        ProgressStyle::default_bar()
            .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")?
            .progress_chars("#>-"),
    );
    // Loop until we find the <JMdict> tag,
    // collect entity definitions along the way
    let mut resolver = DocTypeEntityResolver::default();
    loop {
        // Clear the scratch buffer on every iteration; previously only the
        // `Start` arm cleared it, so it grew unboundedly on other events.
        buf.clear();
        match reader.read_event_into(&mut buf)? {
            Event::Start(ref e) => {
                progress_bar.set_position(reader.buffer_position());
                if e.name().as_ref() == b"JMdict" {
                    break;
                }
            }
            Event::DocType(e) => {
                // Propagate invalid UTF-8 in entity definitions instead of
                // panicking via unwrap.
                resolver.capture(e)?;
            }
            // Previously missing: without this arm a file lacking a <JMdict>
            // root spun forever on repeated `Eof` events.
            Event::Eof => anyhow::bail!("reached EOF without finding <JMdict> root tag"),
            _ => {}
        }
    }
    let now = Instant::now();
    // multithreaded_parser(reader, progress_bar, resolver)?;
    // Propagate the parser's result instead of dropping the must_use Result.
    multithreaded_rayon_parser(reader, progress_bar, resolver)?;
    // singlethreaded_parser(reader, progress_bar, resolver)?;
    // single_threaded_event_list_parser(reader, progress_bar, resolver)?;
    let elapsed = now.elapsed();
    println!("Elapsed: {:?}", elapsed);
    Ok(vec![])
}