Initial commit
This commit is contained in:
100
src/data.rs
Normal file
100
src/data.rs
Normal file
@@ -0,0 +1,100 @@
|
||||
use fake::Dummy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// One written (kanji) form of a dictionary entry — the JMdict `<k_ele>`
/// element (element names taken from the serde renames below).
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct KanjiElement {
    /// The kanji spelling itself (`<keb>`).
    #[serde(rename = "keb")]
    reading: String,

    // Split-out priority fields kept for reference; the raw `ke_pri`
    // strings are stored unparsed in `priorities` instead.
    // news: Option<u8>,
    // ichi: Option<u8>,
    // spec: Option<u8>,
    // gai: Option<u8>,
    // nf: Option<u8>,

    /// Raw priority markers (`<ke_pri>`); empty when the element is absent.
    #[serde(rename = "ke_pri", default)]
    priorities: Vec<String>,

    /// Orthography information strings (`<ke_inf>`); empty when absent.
    #[serde(rename = "ke_inf", default)]
    info: Vec<String>,
}
|
||||
|
||||
/// One kana reading of a dictionary entry — the JMdict `<r_ele>` element
/// (element names taken from the serde renames below).
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct ReadingElement {
    /// The reading in kana (`<reb>`).
    #[serde(rename = "reb")]
    reading: String,

    // Split-out priority fields kept for reference; the raw `re_pri`
    // strings are stored unparsed in `priorities` instead.
    // news: Option<u8>,
    // ichi: Option<u8>,
    // spec: Option<u8>,
    // gai: Option<u8>,
    // nf: Option<u8>,

    /// Raw priority markers (`<re_pri>`); empty when the element is absent.
    #[serde(rename = "re_pri", default)]
    priorities: Vec<String>,

    /// Reading information strings (`<re_inf>`); empty when absent.
    #[serde(rename = "re_inf", default)]
    info: Vec<String>,

    /// Restrictions tying this reading to particular kanji forms
    /// (`<re_restr>`); empty when the reading applies to all of them.
    #[serde(rename = "re_restr", default)]
    restrictions: Vec<String>,
}
|
||||
|
||||
/// Origin of a loanword sense — presumably maps JMdict's `<lsource>`
/// element (no serde renames here; TODO confirm against the parser that
/// fills it in).
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct LanguageSource {
    /// Source language of the borrowed word.
    language: String,
    /// The source-language phrase, when it is given.
    phrase: Option<String>,
    /// Whether the source fully describes this sense.
    fully_describes_sense: bool,
    /// Whether the word was constructed from smaller source-language words.
    constructed_from_smaller_words: bool,
}
|
||||
|
||||
/// A single translation/gloss of a sense.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Glossary {
    /// Target language of the gloss.
    language: String,
    /// The gloss text itself.
    phrase: String,
    /// Gloss type, when specified (raw keyword is escaped as `r#type`
    /// because `type` is reserved in Rust).
    r#type: Option<String>,
}
|
||||
|
||||
/// An unresolved cross-reference, split into its optional parts
/// (kanji form, reading, and sense number) before resolution to an entry.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct XRefParts {
    /// Referenced kanji form, if present in the raw reference.
    kanji_ref: Option<String>,
    /// Referenced reading, if present in the raw reference.
    reading_ref: Option<String>,
    /// Referenced sense number within the target entry, if present.
    sense_num: Option<i32>,
}
|
||||
|
||||
/// A resolved cross-reference pointing at a concrete entry and reading.
/// NOTE(review): not referenced by `Sense` yet (which stores `XRefParts`);
/// presumably the post-resolution form — confirm intended usage.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct XRef {
    /// Identifier of the referenced entry.
    entry_id: String,
    /// The specific reading being referenced.
    reading: String,
}
|
||||
|
||||
/// One sense (meaning) of a dictionary entry, with its translations and
/// usage metadata. Field names carry no serde renames, so this struct is
/// not yet wired into XML deserialization (see the `Entry.senses` note).
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Sense {
    /// Sense identifier.
    id: u64,
    /// Cross-references to antonym senses.
    antonyms: Vec<XRefParts>,
    /// Dialects in which this sense is used.
    dialects: Vec<String>,
    /// Fields of application (domain tags).
    fields: Vec<String>,
    /// Free-form usage notes.
    info: Vec<String>,
    /// Loanword origins of this sense.
    language_source: Vec<LanguageSource>,
    /// Translations of this sense.
    glossary: Vec<Glossary>,
    /// Miscellaneous tags.
    misc: Vec<String>,
    /// Parts of speech.
    pos: Vec<String>,
    /// Kanji forms this sense is restricted to (empty = all forms).
    restricted_to_kanji: Vec<String>,
    /// Readings this sense is restricted to (empty = all readings).
    restricted_to_reading: Vec<String>,
    /// Related ("see also") cross-references.
    see_also: Vec<XRefParts>,
}
|
||||
|
||||
/// A complete dictionary entry — the JMdict `<entry>` element.
#[derive(Debug, Default, Clone, PartialEq, Dummy, Serialize, Deserialize)]
pub struct Entry {
    /// Unique sequence number (`<ent_seq>`); defaults to 0 when absent.
    #[serde(rename = "ent_seq", default)]
    id: u64,

    /// Written forms — zero or more `<k_ele>` children.
    #[serde(rename = "k_ele", default)]
    kanji: Vec<KanjiElement>,

    /// Kana readings — `<r_ele>` children.
    #[serde(rename = "r_ele", default)]
    readings: Vec<ReadingElement>,

    // NOTE(review): placeholder — `<sense>` elements are deserialized into
    // unit values, so all sense content is discarded. `Sense` above looks
    // like the intended element type; switch to `Vec<Sense>` once its
    // field renames match the DTD — TODO confirm.
    #[serde(rename = "sense", default)]
    senses: Vec<()>,
}
|
||||
46
src/doctype_entity_resolver.rs
Normal file
46
src/doctype_entity_resolver.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
use quick_xml::{de::EntityResolver, events::BytesText};
|
||||
use regex::bytes::Regex;
|
||||
use std::{collections::BTreeMap, string::FromUtf8Error};
|
||||
|
||||
/// Resolves XML entity references (`&name;`) using `<!ENTITY name "value">`
/// declarations scraped out of the document's DOCTYPE with a regex.
#[derive(Debug, Clone)]
pub struct DocTypeEntityResolver {
    /// Matches one `<!ENTITY name "value">` declaration; group 1 is the
    /// name, group 2 the replacement text.
    re: Regex,
    /// Entity name -> replacement text.
    map: BTreeMap<String, String>,
}
|
||||
|
||||
impl Default for DocTypeEntityResolver {
|
||||
fn default() -> Self {
|
||||
let mut map = BTreeMap::new();
|
||||
|
||||
map.insert("lt".to_string(), "<".to_string());
|
||||
map.insert("gt".to_string(), ">".to_string());
|
||||
map.insert("amp".to_string(), "&".to_string());
|
||||
map.insert("apos".to_string(), "'".to_string());
|
||||
map.insert("quot".to_string(), "\"".to_string());
|
||||
|
||||
Self {
|
||||
// We do not focus on true parsing in this example
|
||||
// You should use special libraries to parse DTD
|
||||
re: Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#).unwrap(),
|
||||
map,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EntityResolver for DocTypeEntityResolver {
|
||||
type Error = FromUtf8Error;
|
||||
|
||||
fn capture(&mut self, doctype: BytesText) -> Result<(), Self::Error> {
|
||||
for cap in self.re.captures_iter(&doctype) {
|
||||
self.map.insert(
|
||||
String::from_utf8(cap[1].to_vec())?,
|
||||
String::from_utf8(cap[2].to_vec())?,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn resolve(&self, entity: &str) -> Option<&str> {
|
||||
self.map.get(entity).map(|s| s.as_str())
|
||||
}
|
||||
}
|
||||
91
src/event_queue_reader.rs
Normal file
91
src/event_queue_reader.rs
Normal file
@@ -0,0 +1,91 @@
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use quick_xml::{de::{PayloadEvent, XmlRead}, events::Event, DeError, Decoder};
|
||||
|
||||
/// Converts raw reader events into semi-trimmed payload events,
/// independently of how the events were read.
struct StartTrimmer {
    /// If `true`, then leading whitespace will be removed from the next
    /// returned [`Event::Text`]. This field is set to `true` after reading
    /// each event except [`Event::Text`] and [`Event::CData`], so
    /// [`Event::Text`] events read right after them are not trimmed.
    trim_start: bool,
}
|
||||
|
||||
impl StartTrimmer {
|
||||
/// Converts raw reader's event into a payload event.
|
||||
/// Returns `None`, if event should be skipped.
|
||||
#[inline(always)]
|
||||
fn trim<'a>(&mut self, event: Event<'a>) -> Option<PayloadEvent<'a>> {
|
||||
let (event, trim_next_event) = match event {
|
||||
Event::DocType(e) => (PayloadEvent::DocType(e), true),
|
||||
Event::Start(e) => (PayloadEvent::Start(e), true),
|
||||
Event::End(e) => (PayloadEvent::End(e), true),
|
||||
Event::Eof => (PayloadEvent::Eof, true),
|
||||
|
||||
// Do not trim next text event after Text or CDATA event
|
||||
Event::CData(e) => (PayloadEvent::CData(e), false),
|
||||
Event::Text(mut e) => {
|
||||
// If event is empty after trimming, skip it
|
||||
if self.trim_start && e.inplace_trim_start() {
|
||||
return None;
|
||||
}
|
||||
(PayloadEvent::Text(e), false)
|
||||
}
|
||||
|
||||
_ => return None,
|
||||
};
|
||||
self.trim_start = trim_next_event;
|
||||
Some(event)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for StartTrimmer {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self { trim_start: true }
|
||||
}
|
||||
}
|
||||
|
||||
/// An event source that replays a pre-collected queue of events instead
/// of reading from an underlying byte stream.
pub struct EventReader<'i> {
    /// Events to replay, consumed from the front.
    events: VecDeque<Event<'i>>,
    /// Whitespace-trimming state carried across successive events.
    start_trimmer: StartTrimmer,
}
|
||||
|
||||
impl<'i> EventReader<'i> {
|
||||
pub fn new(events: VecDeque<Event<'i>>) -> Self {
|
||||
Self {
|
||||
events,
|
||||
start_trimmer: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'i> XmlRead<'i> for EventReader<'i> {
|
||||
fn next(&mut self) -> Result<quick_xml::de::PayloadEvent<'i>, DeError> {
|
||||
loop {
|
||||
// TODO: fix the returned error
|
||||
let event = self.events.pop_front().ok_or(DeError::UnexpectedEof)?;
|
||||
// let event = self.events
|
||||
if let Some(event) = self.start_trimmer.trim(event) {
|
||||
return Ok(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn read_to_end(&mut self, name: quick_xml::name::QName) -> Result<(), DeError> {
|
||||
while let Some(event) = self.events.pop_front() {
|
||||
if let Event::End(ref e) = event {
|
||||
if e.name() == name {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(DeError::UnexpectedEof)
|
||||
}
|
||||
|
||||
fn decoder(&self) -> Decoder {
|
||||
Decoder { }
|
||||
}
|
||||
}
|
||||
252
src/main.rs
Normal file
252
src/main.rs
Normal file
@@ -0,0 +1,252 @@
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
fs::File,
|
||||
io::{BufRead, BufReader, Seek, SeekFrom},
|
||||
thread,
|
||||
time::Instant,
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
|
||||
use data::Entry;
|
||||
use doctype_entity_resolver::DocTypeEntityResolver;
|
||||
use event_queue_reader::EventReader;
|
||||
use fake::{Fake, Faker};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use quick_xml::{
|
||||
de::{Deserializer, EntityResolver},
|
||||
events::Event,
|
||||
DeError, Reader,
|
||||
};
|
||||
use rayon::iter::{ParallelBridge, ParallelIterator};
|
||||
use serde::Deserialize;
|
||||
|
||||
use std::sync::mpsc::channel;
|
||||
|
||||
mod data;
|
||||
mod doctype_entity_resolver;
|
||||
mod event_queue_reader;
|
||||
|
||||
// Command-line interface: a single required subcommand.
// (Plain `//` comments here on purpose — `///` doc comments would become
// clap help text and change the program's --help output.)
#[derive(Parser)]
struct Cli {
    // Which operation to run.
    #[command(subcommand)]
    cmd: Command,
}
|
||||
|
||||
// Top-level subcommands. (`//` comments used so clap's generated help
// output stays unchanged.)
#[derive(Parser)]
enum Command {
    // Parse an XML file and report timing.
    #[command()]
    Parse(ParseArgs),

    // Generate random fake entries and print them as XML.
    #[command()]
    Generate,
}
|
||||
|
||||
// Arguments for the `parse` subcommand.
#[derive(Parser)]
struct ParseArgs {
    // Path of the XML file to parse (-f / --file).
    #[arg(short, long)]
    file: String,
}
|
||||
|
||||
fn main() {
|
||||
let args = Cli::parse();
|
||||
env_logger::init();
|
||||
match args.cmd {
|
||||
Command::Parse(parse) => {
|
||||
log::info!("Parsing file: {}", parse.file);
|
||||
parse_xml(&parse.file).unwrap();
|
||||
}
|
||||
Command::Generate => {
|
||||
log::info!("Generating random data");
|
||||
|
||||
let data: Vec<Entry> = (0..10).map(|_| Faker.fake()).collect();
|
||||
let xml = quick_xml::se::to_string_with_root("data", &data).unwrap();
|
||||
println!("{}", xml);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: can we avoid into_owned here?
|
||||
fn parse_events_for_tag<'i, R>(
|
||||
reader: &mut Reader<R>,
|
||||
tagname: &str,
|
||||
) -> Result<VecDeque<Event<'i>>, anyhow::Error>
|
||||
where
|
||||
R: BufRead,
|
||||
{
|
||||
let mut events = VecDeque::new();
|
||||
|
||||
loop {
|
||||
let mut buf = Vec::new();
|
||||
let event = { reader.read_event_into(&mut buf)? };
|
||||
match event {
|
||||
Event::Start(ref e) if e.name().as_ref() == tagname.as_bytes() => {
|
||||
events.push_back(event.into_owned());
|
||||
break;
|
||||
}
|
||||
Event::Eof => {
|
||||
// TODO: fix
|
||||
anyhow::bail!("EOF");
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let mut buf = Vec::new();
|
||||
let event = reader.read_event_into(&mut buf)?;
|
||||
match event {
|
||||
Event::End(ref e) if e.name().as_ref() == tagname.as_bytes() => {
|
||||
events.push_back(event.into_owned());
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
events.push_back(event.into_owned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(events)
|
||||
}
|
||||
|
||||
fn parse_entry_from_events(
|
||||
events: VecDeque<Event<'_>>,
|
||||
resolver: &DocTypeEntityResolver,
|
||||
) -> anyhow::Result<Entry> {
|
||||
let reader = EventReader::new(events);
|
||||
// NOTE: this constructor is made `pub` in a local fork, this is not
|
||||
// `pub` with the upstream crate
|
||||
let mut deserializer = Deserializer::new(reader, resolver.clone());
|
||||
|
||||
Entry::deserialize(&mut deserializer).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
fn singlethreaded_parser(
|
||||
reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut buf_reader = reader.into_inner();
|
||||
let mut deserializer =
|
||||
quick_xml::de::Deserializer::with_resolver(&mut buf_reader, resolver.clone());
|
||||
loop {
|
||||
let entry = Entry::deserialize(&mut deserializer);
|
||||
if let Err(DeError::UnexpectedEof) = entry {
|
||||
break Ok(());
|
||||
}
|
||||
progress_bar.set_position(deserializer.get_ref().get_ref().buffer_position());
|
||||
}
|
||||
}
|
||||
|
||||
fn single_threaded_event_list_parser(
|
||||
mut reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let events = parse_events_for_tag(&mut reader, "entry")?;
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
let entry = parse_entry_from_events(events.clone(), &resolver);
|
||||
// println!("{:?}", entry);
|
||||
// println!("{:?}\n", events);
|
||||
}
|
||||
}
|
||||
|
||||
fn multithreaded_parser(
|
||||
mut reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
let (sender, receiver) = channel();
|
||||
|
||||
let handle1 = thread::spawn(move || loop {
|
||||
let events = parse_events_for_tag(&mut reader, "entry").unwrap();
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
sender.send(events).unwrap();
|
||||
});
|
||||
|
||||
// TODO: more threads
|
||||
let handle2 = thread::spawn(move || {
|
||||
receiver.into_iter().for_each(|event_list| {
|
||||
let entry = parse_entry_from_events(event_list, &resolver);
|
||||
// println!("{:?}", entry);
|
||||
});
|
||||
});
|
||||
|
||||
println!("Waiting for threads to finish");
|
||||
handle1.join();
|
||||
println!("First thread finished");
|
||||
handle2.join();
|
||||
println!("Threads finished");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn multithreaded_rayon_parser(
|
||||
mut reader: Reader<BufReader<File>>,
|
||||
progress_bar: ProgressBar,
|
||||
resolver: DocTypeEntityResolver,
|
||||
) -> anyhow::Result<()> {
|
||||
let (sender, receiver) = channel();
|
||||
|
||||
let handle1 = thread::spawn(move || loop {
|
||||
let events = parse_events_for_tag(&mut reader, "entry").unwrap();
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
sender.send(events).unwrap();
|
||||
});
|
||||
|
||||
receiver.into_iter().par_bridge().for_each(|event_list| {
|
||||
let entry = parse_entry_from_events(event_list, &resolver);
|
||||
// println!("{:?}", entry);
|
||||
});
|
||||
|
||||
handle1.join();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_xml(file_path: &str) -> anyhow::Result<Vec<Entry>> {
|
||||
let mut buf_reader = BufReader::new(File::open(file_path)?);
|
||||
|
||||
let total_bytes = buf_reader.seek(SeekFrom::End(0))?;
|
||||
buf_reader.seek(SeekFrom::Start(0))?;
|
||||
|
||||
let mut reader = Reader::from_reader(buf_reader);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let progress_bar = ProgressBar::new(total_bytes);
|
||||
progress_bar.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")?
|
||||
.progress_chars("#>-"),
|
||||
);
|
||||
|
||||
// Loop until we find the <JMdict> tag,
|
||||
// collect entity definitions along the way
|
||||
let mut resolver = DocTypeEntityResolver::default();
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf)? {
|
||||
Event::Start(ref e) => {
|
||||
progress_bar.set_position(reader.buffer_position());
|
||||
if e.name().as_ref() == b"JMdict" {
|
||||
break;
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
Event::DocType(e) => {
|
||||
resolver.capture(e).unwrap();
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
// multithreaded_parser(reader, progress_bar, resolver);
|
||||
multithreaded_rayon_parser(reader, progress_bar, resolver);
|
||||
// singlethreaded_parser(reader, progress_bar, resolver);
|
||||
// single_threaded_event_list_parser(reader, progress_bar, resolver);
|
||||
let elapsed = now.elapsed();
|
||||
println!("Elapsed: {:?}", elapsed);
|
||||
|
||||
Ok(vec![])
|
||||
}
|
||||
Reference in New Issue
Block a user